# Import wrangling packages
import os
import math
import pandas as pd
import numpy as np
import psycopg2
import itertools
import warnings
from scipy.stats import pearsonr
from difflib import SequenceMatcher
from datetime import datetime, timedelta
import re
import calendar
import statsmodels.formula.api as smf
# Import visualization packages
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.patches as mpatches
import matplotlib.style as style
import seaborn as sns
#import geopandas as gpd
#from shapely.geometry import Point, Polygon
# Establish connection
# Open a connection to the public OxCOVID19 PostgreSQL database.
# The credentials below are the project's published read-only ones.
conn = psycopg2.connect(
host='covid19db.org',
port=5432,
dbname='covid19',
user='covid19',
password='covid19')
# Cursor kept for raw SQL use; most queries below go through pd.read_sql(conn)
cur = conn.cursor()
# Declaring which World Bank indicators to use
vars_to_use = ['Population density (people per sq. km of land area)', 'Population ages 65 and above, total', 'Air transport, passengers carried', 'Current health expenditure (% of GDP)', 'GDP per capita (constant 2010 US$)', 'Population, total']
# Fetch all rows for the chosen indicators. A parameterized query replaces
# the original manual quote-wrapping of the indicator names: psycopg2
# expands a bound tuple into a proper IN (...) list and escapes any
# embedded quote characters safely.
sql_command = """
SELECT country, indicator_name, value, year
FROM world_bank
WHERE indicator_name IN %s AND source = 'World Bank'
"""
indicators_wb = pd.read_sql(sql_command, conn, params=(tuple(vars_to_use),))
# Inspect which years are available for each indicator (used below to pick
# a single reference year per indicator)
for var in vars_to_use:
    print(f"\nVariable {var}")
    display(indicators_wb.loc[indicators_wb.indicator_name == var, 'year'].value_counts())
Variable Population density (people per sq. km of land area)
2018 206 2017 4 1960 2 2019 2 2011 1 Name: year, dtype: int64
Variable Population ages 65 and above, total
2019 192 1960 22 2011 1 Name: year, dtype: int64
Variable Air transport, passengers carried
2018 154 1960 30 2004 4 2009 4 2014 4 1991 3 2016 3 1998 2 1999 2 2000 2 2017 2 1992 1 1996 1 2001 1 2012 1 2015 1 Name: year, dtype: int64
Variable Current health expenditure (% of GDP)
2017 185 1960 27 2011 1 2012 1 2015 1 Name: year, dtype: int64
Variable GDP per capita (constant 2010 US$)
2019 177 2018 16 1960 10 2010 4 2017 4 2011 1 2013 1 2014 1 2015 1 Name: year, dtype: int64
Variable Population, total
2019 214 2011 1 Name: year, dtype: int64
# Reference year chosen for each indicator (the year with the widest
# country coverage, per the value_counts inspection above)
indicator_years = {'Population density (people per sq. km of land area)': 2018,
                   'Population ages 65 and above, total': 2019,
                   'Air transport, passengers carried': 2018,
                   'Current health expenditure (% of GDP)': 2017,
                   'GDP per capita (constant 2010 US$)': 2019,
                   'Population, total': 2019}
# Keep only the rows of each indicator's chosen year
for indicator, keep_year in indicator_years.items():
    wrong_year = (indicators_wb.indicator_name == indicator) & (indicators_wb.year != keep_year)
    indicators_wb = indicators_wb[~wrong_year].copy()
# Sanity check: exactly one (indicator, year) group should remain per indicator
indicators_wb.groupby(['indicator_name', 'year']).size()
indicator_name year Air transport, passengers carried 2018 154 Current health expenditure (% of GDP) 2017 185 GDP per capita (constant 2010 US$) 2019 177 Population ages 65 and above, total 2019 192 Population density (people per sq. km of land area) 2018 206 Population, total 2019 214 dtype: int64
# Reshape to wide: one row per country, one column per indicator
ipv = indicators_wb.pivot_table(index=['country'], columns=['indicator_name'], values='value').reset_index()
# Derived per-capita ratios used as covariates later
ipv['passengers_carried_by_population'] = ipv['Air transport, passengers carried'] / ipv['Population, total']
ipv['65_and_above_by_population'] = ipv['Population ages 65 and above, total'] / ipv['Population, total']
def get_cases(conn):
    """Query WHO epidemiology rows joined with 2019 population per country.

    Args:
        conn (connection): a connection to the OXCOVID-19 DB

    Returns:
        cases_df (dataframe): one row per (country, date) with cumulative
        confirmed/dead counts and the country's total population.
    """
    # Cumulative counts from the WHO source, restricted to 2020, joined to
    # the World Bank 2019 'Population, total' indicator on country code.
    query = """
SELECT ep.date,
ep.country,
ep.confirmed,
ep.dead,
ep.countrycode,
wb.country AS country_wb,
wb.value AS population
FROM epidemiology AS ep
LEFT JOIN world_bank AS wb
ON ep.countrycode = wb.countrycode
WHERE ep.source = 'WRD_WHO'
AND wb.indicator_name = 'Population, total'
AND wb.year = 2019 -- Maximum year
AND ep.date <= '2020-12-31' -- Making sure there are no cases from 2021
"""
    cases_df = pd.read_sql(query, conn)
    # Data-quality checks: the join must not mix countries, and the inner
    # result must be complete.
    assert (cases_df['country'] == cases_df['country_wb']).mean() == 1, 'Not all countries are identical. Re-check join key.'
    assert cases_df.isnull().values.any() == False, 'There are missing values'
    # Report which epidemiology countries have no World Bank population row
    exclusion_query = """
SELECT DISTINCT country
FROM epidemiology
EXCEPT
SELECT DISTINCT country
FROM world_bank
"""
    excluded = pd.read_sql(exclusion_query, conn)
    print(f"A total of {len(excluded)} entries are excluded from the initial dataset since they do not have data for Population.\n"
          f"These are: {', '.join(excluded['country'])}")
    return cases_df
def calculate_capita_metrics(cases_df):
    """Add per-capita rates and calendar columns to the case data.

    Args:
        cases_df (dataframe): dataframe containing COVID-19 data

    Returns:
        cases_df (dataframe): same frame with confirmed/dead per 100K,
        case-fatality ratio, parsed date and month columns added.
    """
    # Rates per 100,000 inhabitants
    for source_col, rate_col in (('confirmed', 'confirmed_per_100K'),
                                 ('dead', 'dead_per_100K')):
        cases_df[rate_col] = (cases_df[source_col] / cases_df['population']) * 100_000
    # Crude case-fatality ratio
    cases_df['dead_per_cases'] = cases_df['dead'] / cases_df['confirmed']
    # Parse dates and keep the calendar month for later aggregation
    cases_df['date'] = pd.to_datetime(cases_df['date'])
    cases_df['month'] = cases_df['date'].dt.month
    return cases_df
def get_daily_growth(cases_df):
    """Calculate daily growth of cases from the number of cumulative cases.

    Args:
        cases_df (dataframe): dataframe containing COVID-19 data.

    Returns:
        cases_df (dataframe): dataframe with COVID-19 case differences
        (first day of each country is filled with 0).
    """
    # diff() is only meaningful on a frame sorted by country then date
    cases_df.sort_values(['country', 'date'], ascending=True, inplace=True)
    # new column -> cumulative column it is derived from
    differences = {'new_cases_total': 'confirmed',
                   'new_cases_per_100K': 'confirmed_per_100K',
                   'new_deaths_total': 'dead'}
    # Vectorized per-country diff: one groupby pass per column instead of
    # the original countries x columns boolean-mask loop (same values,
    # much faster on many countries).
    grouped = cases_df.groupby('country')
    for new_col, cumulative_col in differences.items():
        cases_df[new_col] = grouped[cumulative_col].diff().fillna(0)
    return cases_df
def get_newest_df(cases_df, time_lag=10):
    """Return the one-day slice of the data lagged *time_lag* days from today.

    Args:
        cases_df (dataframe): dataframe containing COVID-19 data.
        time_lag (int, optional): number of days to subtract from today.

    Returns:
        df_newest (dataframe): rows of the single reference date, one per country.
    """
    reference_day = datetime.today() - timedelta(days=time_lag)
    newest_date = reference_day.strftime('%Y-%m-%d')
    df_newest = cases_df.loc[cases_df['date'] == newest_date]
    # Every country must appear exactly once on the chosen date
    n_countries = df_newest['country'].nunique()
    assert n_countries == len(df_newest), 'Countries are not unique'
    assert n_countries == cases_df['country'].nunique(), f'Not all countries have the date {newest_date}'
    return df_newest
def daily_growth_check(cases_df, drop_countries=True, drop_threshold=0.05):
    """Check whether the daily growth of COVID-19 cases is below zero.

    Daily new cases should never be negative; a negative value means the
    cumulative series was revised downward. For each offending country the
    total negative growth is compared to its current cumulative case count:

    1. Extract the total negative growth for each country.
    2. Compare it to the country's overall (newest) case count.
    3. Drop the country only when the proportion exceeds ``drop_threshold``.

    Example: Oxlandia had 40,000 and 39,900 cumulative cases on September 1st
    and 2nd and 100K cases overall: the 100-case drop is 0.1% of its current
    cases, below the default threshold, so Oxlandia is kept.

    Args:
        cases_df (dataframe): dataframe containing COVID-19 data.
        drop_countries (boolean): indicate whether to drop countries or not.
        drop_threshold (float): the threshold above which countries are dropped.

    Returns:
        cases_df (dataframe): dataframe without the countries whose overall
        drop exceeds the threshold.
    """
    negative_diff = np.sum(cases_df['new_cases_total'] < 0)
    # Drop logic nested under this check so all_drops is only used when it
    # exists (no negative rows -> nothing to evaluate).
    if negative_diff > 0:
        print(f"\nThere are {negative_diff} cases of negative daily growth in cases from the data\n")
        negative_df = cases_df[cases_df['new_cases_total'] < 0].copy()
        # Total negative growth per offending country
        all_drops = negative_df.groupby('country').new_cases_total.sum()
        if drop_countries:
            df_newest = get_newest_df(cases_df)
            # Renamed: the original reused the name ``drop_countries`` here,
            # shadowing the boolean parameter with this list.
            countries_to_drop = []
            for country in all_drops.index:
                total_cases = df_newest.loc[df_newest['country'] == country, 'confirmed']
                perc_drop = - (all_drops[country] / total_cases).values
                if perc_drop > drop_threshold:
                    countries_to_drop.append(country)
            if len(countries_to_drop) > 0:
                print("Dropped {} countries from further analysis: {}.\n".format(len(countries_to_drop), ', '.join(countries_to_drop)))
                cases_df = cases_df[~cases_df['country'].isin(countries_to_drop)]
    return cases_df
# Build the analysis frame: raw cases -> per-capita metrics -> daily growth
# -> integrity check that removes heavily-revised countries.
cases_df = get_cases(conn)
cases_df = calculate_capita_metrics(cases_df)
cases_df = get_daily_growth(cases_df)
cases_df = daily_growth_check(cases_df, drop_countries = True)
A total of 30 entries are excluded from the initial dataset since they do not have data for Population. These are: Mayotte, Other continent, Saint Helena, Oceania, Pitcairn Islands, Martinique, Reunion, Wallis and Futuna, Guernsey, Montserrat, Cook Islands, Kosovo, Asia, French Guiana, Bonaire, Sint Eustatius and Saba, Tokelau, Saint-Barthélemy, America, Saint Pierre and Miquelon, Falkland Islands, Taiwan, Niue, Anguilla, World, Guadeloupe, Africa, Western Sahara, Vatican City, Europe, Jersey There are 64 cases of negative daily growth in cases from the data Dropped 4 countries from further analysis: Benin, Ecuador, Puerto Rico, São Tomé and Príncipe.
def get_metrics_df(df):
    """Attach a calendar-month column derived from the 'date' column."""
    df['month'] = df['date'].map(lambda d: pd.to_datetime(d).month)
    return df
cases_df = get_metrics_df(cases_df)
# Hofstede cultural-dimensions dataset ('idv' = individualism index)
cultural_df = pd.read_excel("6-dimensions-for-website-2015-08-16.xls")
# Get country estimates
# Expand the sheet's '... Rep' suffix to '... Republic'
cultural_df['country'] = cultural_df['country'].map(lambda x: x[:-3] + 'Republic' if str(x)[-3:] == 'Rep' else x)
# Count rows flagged as estimated (rather than surveyed) scores
print("Estimated: ", cultural_df[cultural_df['idv'].notnull()]['Estimated'].sum())
# Harmonize country names with the epidemiology dataset
change_names = {'Bosnia': 'Bosnia and Herzegovina',
'Great Britain' : 'United Kingdom',
'Kyrgyz Republic': 'Kyrgyzstan',
'Macedonia Republic': 'Macedonia',
'Slovak Republic': 'Slovakia',
'U.S.A.': 'United States',
'Korea South': 'South Korea'}
cultural_df['country'] = cultural_df['country'].replace(change_names)
Estimated: 24.0
# Attach the case time series to each cultural row (one culture row fans
# out to many daily case rows)
df_cult_cases = cultural_df.merge(cases_df, how = 'left', left_on = 'country', right_on = 'country', validate='one_to_many')
# Which cultural-dataset entries have no case data (regions, sub-groups)?
cultural_set = set(cultural_df['country'].unique())
cases_set = set(cases_df['country'])
cultural_set.difference(cases_set)
{'Africa East',
'Africa West',
'Arab countries',
'Belgium French',
'Belgium Netherl',
'Canada French',
'Ecuador',
'Germany East',
'Hong Kong',
'Puerto Rico',
'South Africa white',
'Switzerland French',
'Switzerland German',
'Taiwan'}
# Merge in the World Bank indicator pivot (one indicator row per country)
df1 = df_cult_cases.merge(ipv, how= 'left', left_on='country', right_on='country', validate='many_to_one')
# Which merged countries have no indicator data?
ipv_countries = set(ipv['country'].unique())
df1_countries = set(df1['country'].unique())
df1_countries.difference(ipv_countries)
{'Africa East',
'Africa West',
'Arab countries',
'Belgium French',
'Belgium Netherl',
'Canada French',
'Germany East',
'South Africa white',
'Switzerland French',
'Switzerland German',
'Taiwan'}
def c_count(df):
    """Print how many distinct countries the dataframe contains."""
    unique_countries = df['country'].nunique()
    print("Unique countries:", unique_countries)
# Country counts should match across the merge chain (no rows lost)
c_count(df1)
c_count(cultural_df)
c_count(df_cult_cases)
Unique countries: 111 Unique countries: 111 Unique countries: 111
# Aggregate daily rows to (country, month) level: cumulative columns keep
# their month-end maximum, static covariates their mean, daily-growth
# columns their monthly sum.
df1['month'] = pd.to_datetime(df1.date).dt.month
df1_agg = df1.groupby(['country', 'month']).agg({ 'idv': 'mean',
'confirmed': 'max', # latest # of cumulative cases each month
'dead': 'max', # latest dead cases of each month
'population': 'mean',
'Current health expenditure (% of GDP)': 'mean',
'passengers_carried_by_population': 'mean',
'GDP per capita (constant 2010 US$)': 'mean',
'Population ages 65 and above, total': 'mean',
'Population density (people per sq. km of land area)': 'mean',
'65_and_above_by_population': 'mean',
'Population, total': 'mean',
'new_cases_per_100K': 'sum',
'new_cases_total': 'sum',
'new_deaths_total': 'sum'}).reset_index()
def get_logs(df, vars_to_log=('new_cases_total', 'new_cases_per_100K', 'new_deaths_total')):
    """Add natural-log versions of the given columns.

    Args:
        df (dataframe): frame holding the columns to transform.
        vars_to_log (iterable of str): column names to log-transform.
            Default changed from a mutable list to a tuple (same values).

    Returns:
        df (dataframe): with one new '<col>_log' column per input column.
    """
    for var in vars_to_log:
        # small epsilon keeps log(0) finite for zero-growth months
        df[var + '_log'] = np.log(df[var] + 0.000001)
    return df
df1_agg = get_logs(df1_agg)
# NOTE(review): the name 'ipv_countries' is reused here but actually holds
# df1's countries — the comparison is pre- vs post-aggregation coverage.
ipv_countries = set(df1['country'].unique())
df1_countries = set(df1_agg['country'].unique())
ipv_countries.difference(df1_countries)
{'Africa East',
'Africa West',
'Arab countries',
'Belgium French',
'Belgium Netherl',
'Canada French',
'Ecuador',
'Germany East',
'Hong Kong',
'Puerto Rico',
'South Africa white',
'Switzerland French',
'Switzerland German',
'Taiwan'}
df1_countries.difference(ipv_countries)
set()
c_count(df1_agg)
Unique countries: 97
# Google COVID-19 Community Mobility Report
mobility = pd.read_csv("Global_Mobility_Report.csv")
# Keep only country, date and four mobility categories
mob = mobility[['country_region', 'date', 'retail_and_recreation_percent_change_from_baseline', 'parks_percent_change_from_baseline', 'transit_stations_percent_change_from_baseline', 'workplaces_percent_change_from_baseline']].copy()
mob['month'] = pd.to_datetime(mob['date']).dt.month # get month
# Monthly mean of each category per country...
mob_agg_all = mob.groupby(['country_region', 'month']).mean()#.reset_index() # group by baseline
# ...then averaged across the four categories into one mobility index
mob_agg = mob_agg_all.mean(axis=1).reset_index().rename(columns={0:'mobility_index'})
C:\Users\Paulius\AppData\Roaming\Python\Python38\site-packages\IPython\core\interactiveshell.py:3146: DtypeWarning: Columns (4,5) have mixed types.Specify dtype option on import or set low_memory=False. has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
# Join the mobility index on (country, month)
df2_agg = df1_agg.merge(mob_agg, how = 'left', left_on = ['country', 'month'], right_on = ['country_region', 'month'], validate = 'one_to_one')
# Country count must be unchanged by the left join
df2_agg['country'].nunique()
97
# Drop month 1
df2_agg = df2_agg[df2_agg['month'] != 1].copy()
# Spot-check the raw retail/recreation mobility series for ten regions
fig, axs = plt.subplots(5, 2, figsize=(30, 15))
flat_axes = [ax for row in axs for ax in row]
for region in mob['country_region'].unique()[0:10]:
    ax = flat_axes.pop(0)
    region_rows = mob[mob['country_region'] == region]
    sns.scatterplot(x='date', y='retail_and_recreation_percent_change_from_baseline', data=region_rows, ax=ax)
    ax.set_title(region)
    ax.set_xlabel('')
    ax.set_ylabel('')
# Our World in Data testing numbers
cols = ['location', 'date', 'new_tests']
tests_df = pd.read_csv("owid-covid-data.csv")[cols]
tests_df = tests_df.fillna(-1) # to get where there is NA
tests_df['month'] = pd.to_datetime(tests_df.date).dt.month
# Monthly sum and min per country; min == -1 marks months with missing data
tests_df_agg = tests_df.groupby(['month', 'location']).agg({'new_tests': ['sum', 'min']}
).reset_index().sort_values(['location', 'month'])
# Flatten the two-level column index from the double aggregation
tests_df_agg = pd.concat([tests_df_agg[['month', 'location']], tests_df_agg['new_tests']], axis = 1)
tests_df_agg.columns = ['month', 'location', 'sum_new_tests', 'min_new_tests'] # Get where is NA by looking at where min_new_tests == -1
# Join testing data on (country, month)
df3_agg = df2_agg.merge(tests_df_agg, how = 'left', left_on = ['country', 'month'], right_on = ['location', 'month'], validate='one_to_one').drop('location', axis = 1)
df3_agg
# Get government-response stringency data (containment_health_index)
sql_command = """
SELECT containment_health_index, date, country
FROM government_response
"""
gov_df = pd.read_sql(sql_command, conn)
def group_month(df, var, country_var='country'):
    """Average *var* per (month, country).

    Args:
        df (dataframe): frame with a 'date' column plus *var* and *country_var*.
        var (str): name of the column to average.
        country_var (str): name of the country column (default 'country').

    Returns:
        df_agg (dataframe): monthly means, sorted by country then month.
    """
    df['month'] = pd.to_datetime(df.date).dt.month
    df_agg = df.groupby(['month', country_var]).agg({var: 'mean'}).reset_index()
    # Sort by the caller-supplied country column: the original hardcoded
    # 'country' here, which raised KeyError for any other country_var.
    return df_agg.sort_values([country_var, 'month'])
# Monthly mean containment index per country, joined on (country, month)
gov_df_agg = group_month(gov_df, 'containment_health_index')
df4_agg = df3_agg.merge(gov_df_agg, how = 'left', left_on = ['country', 'month'], right_on = ['country', 'month'], validate='one_to_one')
# EIU Democracy Index (2019 values only), joined per country
economist_df = pd.read_excel("The_Economist.xlsx")
economist_df = economist_df.loc[economist_df['time'] == 2019, ['name', 'Democracy index (EIU)']].copy()
economist_df.columns = ['country', 'democracy_index']
df5_agg = df4_agg.merge(economist_df, how = 'left', left_on = ['country'], right_on = ['country'], validate='many_to_one')
from opencage.geocoder import OpenCageGeocode
from geopy.distance import geodesic
from countryinfo import CountryInfo
import apikeys # Manual file
import pycountry_convert as pc
# NOTE(review): the original read ``apikeys['key']``, but ``apikeys`` is an
# imported module and modules are not subscriptable (TypeError at runtime).
# Attribute access assumes apikeys.py defines ``key = '...'`` — confirm.
key = apikeys.key  # Get the key
geocoder = OpenCageGeocode(key)
from functools import lru_cache
# Country -> capital lookup.
# NOTE(review): only covers S–Z here and is never referenced by the visible
# code (get_capital uses capitals_2 + CountryInfo) — possibly a leftover.
capitals = {'Slovakia': 'Bratislava',
'Slovenia': 'Ljubljana',
'South Africa': 'Cape Town',
'South Korea': 'Seoul',
'Spain': 'Madrid',
'Sweden': 'Stockholm',
'Switzerland': 'Bern',
'Tanzania': 'Dodoma',
'Thailand': 'Bangkok',
'Trinidad and Tobago': 'Port of Spain',
'Turkey': 'Ankara',
'Uganda': 'Kampala',
'Ukraine': 'Kyiv',
'United Kingdom': 'London',
'United States': 'Washington, D.C.',
'Uruguay': 'Montevideo',
'Venezuela': 'Caracas',
'Vietnam': 'Hanoi',
'Zambia': 'Lusaka',
'Zimbabwe': 'Harare'}
# Manual overrides for countries the countryinfo package cannot resolve
capitals_2 = {
'Andorra': 'Andorra la Vella',
'Macedonia': 'Skopje',
'Montenegro': 'Podgorica'
}
@lru_cache(maxsize=100)
def get_capital(country, capitals_2=capitals_2):
    """Return a country's capital city name, or None when unresolved.

    Args:
        country (str): country name as used in the dataframe.
        capitals_2 (dict): manual overrides checked before the lookup.

    Returns:
        str or None: capital name, or None if the country is unknown.
    """
    if country in capitals_2:
        return capitals_2[country]
    try:
        return CountryInfo(country).capital()
    except Exception:
        # Narrowed from the original bare ``except:``, which also swallowed
        # KeyboardInterrupt/SystemExit.
        return None
# Resolve each country's capital (None when unresolved)
df5_agg['capital'] = df5_agg['country'].map(get_capital)
# List any countries whose capital could not be resolved (expect empty)
df5_agg[df5_agg['capital'].map(lambda x: x is None)]['country'].unique()
# Reference point: Wuhan. NOTE(review): unpacking .values() assumes the
# geometry dict is ordered (lat, lng) — holds for OpenCage responses.
lat_B, lng_B = geocoder.geocode('Wuhan')[0]['geometry'].values()
array([], dtype=object)
@lru_cache(maxsize=100)
def find_distance(city, lat_B=lat_B, lng_B=lng_B):
    """Great-circle distance (km) from *city* to the reference coordinates.

    Args:
        city (str): city name to geocode.
        lat_B, lng_B (float): reference point (defaults bound to Wuhan at
            definition time).

    Returns:
        float: geodesic distance in kilometres.
    """
    result_A = geocoder.geocode(city)
    geometry = result_A[0]['geometry']
    # Index by key: the original unpacked geometry.values(), silently
    # depending on the provider's key order being (lat, lng).
    lat_A, lng_A = geometry['lat'], geometry['lng']
    return geodesic((lat_A, lng_A), (lat_B, lng_B)).km
# Distance of each capital to Wuhan (km); lru_cache keeps one geocoder call
# per unique capital
df5_agg['distance'] = df5_agg['capital'].map(find_distance)
c_count(df5_agg)
Unique countries: 97
def get_continent(name):
    """Map a country name to its two-letter continent code."""
    alpha2_code = pc.country_name_to_country_alpha2(name, cn_name_format="default")
    return pc.country_alpha2_to_continent_code(alpha2_code)
def c_count(df):
    """Print the number of distinct countries (re-definition, identical
    to the helper defined earlier in the notebook)."""
    print("Unique countries:", df['country'].nunique())
df5_agg['continent'] = df5_agg.country.map(get_continent)
# epsilon keeps log(0) finite for countries with zero confirmed cases
df5_agg['confirmed_log'] = np.log(df5_agg['confirmed'] + 0.0000000000001)
# Normalize column names: spaces -> underscores, strip non-word chars, lowercase
df5_agg.columns = [re.sub(r'[^\w]', '', x.replace(' ', '_')).lower() for x in df5_agg.columns] # Change col names
df5_agg.rename(columns={'current_health_expenditure__of_gdp': 'current_health_expenditure_of_gdp',
'65_and_above_by_population': 'over65_per_capita'}, inplace=True)
# Persist the assembled modelling dataset
df5_agg.to_csv("2021-01-04 stats_dataset.csv", index=False)
# Duplicate import (pearsonr is already imported at the top of the file)
from scipy.stats import pearsonr
# Reload the persisted dataset for the analysis section
df = pd.read_csv("2021-01-04 stats_dataset.csv")
def get_individualism_scores(df=df, remove_last_plot=True):
    """Plot monthly regressions of individualism vs. log new cases per 100K.

    Args:
        df (dataframe): stats dataset (default bound to the module-level df
            at definition time).
        remove_last_plot (bool): hide the unused 12th panel of the 4x3 grid.
    """
    fig, axs = plt.subplots(4, 3, figsize=(18, 12))
    remaining_axes = [ax for row in axs for ax in row]
    for month in range(2, 13):
        ax = remaining_axes.pop(0)
        # Keep rows of this month with nonzero cases and a known score
        monthly = df[df['month'] == month]
        monthly = monthly[monthly['new_cases_per_100k'] != 0]
        monthly = monthly[monthly['individualism_index'].notnull()]
        sns.regplot(x = 'new_cases_per_100k_log', y = 'individualism_index', data = monthly, ax=ax)
        corr, p = pearsonr(monthly['new_cases_per_100k_log'], monthly['individualism_index'])
        ax.set_title(f"Month: {month}, R^2: {corr.round(2)}, p = {p.round(3)}")
        ax.set_ylim([0,100])
        # Axis labels only on the outer edge of the grid
        ax.set_ylabel("Individualism score" if month in (1, 4, 7, 10) else "")
        ax.set_xlabel("New cases per 100K people, log" if month in (10, 11, 12) else "")
    if remove_last_plot:
        fig.delaxes(axs[3, 2])
    fig.suptitle("Relationship between individualism and new monthly COVID-19 cases", fontsize = 16)
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
get_individualism_scores()
# NOTE(review): ``month`` is not defined in this cell — it relies on
# leftover notebook state (presumably 12 after a prior loop); confirm.
df_ = df[df['month'] == month]
df_ = df_[df_['new_cases_per_100k'] != 0]
df_ = df_[df_['individualism_index'].notnull()]
sns.regplot(x = 'new_cases_per_100k_log', y = 'individualism_index', data = df_)
# Same regression with three outlier countries removed
sns.regplot(x = 'new_cases_per_100k_log', y = 'individualism_index', data = df_[~df_.country.isin(['Australia', 'New Zealand', 'Vietnam'])])
<AxesSubplot:xlabel='new_cases_per_100k_log', ylabel='individualism_index'>
# Correlation with the three outlier countries excluded
df_2 = df_[~df_.country.isin(['Australia', 'New Zealand', 'Vietnam'])].copy()
corr, p = pearsonr(df_2['new_cases_per_100k_log'], df_2['individualism_index'])
corr
0.5207701239060172
# Monthly regressions of the raw idv score against log cumulative cases
fig, axs = plt.subplots(4, 3, figsize=(17, 12))
remaining_axes = [ax for row in axs for ax in row]
for month in range(2, 13):
    ax = remaining_axes.pop(0)
    monthly = df[df['month'] == month]
    monthly = monthly[monthly['new_cases_per_100k'] != 0]
    sns.regplot(x = 'confirmed_log', y = 'idv', data = monthly, ax=ax)
    ax.set_title(f"Month: {month}")
plt.tight_layout()
# One panel per country: log new cases per 100K across months
countries_unique = df['country'].unique()
fig, axs = plt.subplots(10, 10, figsize=(5 * 10, 4 * 9), sharex=True, sharey=True)
remaining_axes = [ax for row in axs for ax in row]
for country in countries_unique:
    ax = remaining_axes.pop(0)
    country_rows = df[df['country'] == country]
    sns.scatterplot(x='month', y = 'new_cases_per_100k_log', data = country_rows, ax = ax)
    ax.set_title(country)
plt.tight_layout()
# Remove countries without an individualism score (the key variable)
drop_c = df.loc[df.idv.isnull(),'country'].unique()
print(f"Dropping {drop_c} countries due to missing values with the dependent variable")
df = df[~df['country'].isin(drop_c)].copy()
Dropping ['Andorra' 'Cyprus' 'Kyrgyzstan' 'Macedonia' 'Mali' 'Rwanda' 'Uganda' 'Zimbabwe'] countries due to missing values with the dependent variable
c_count(df)
Unique countries: 89
# Columns that still contain any missing value, with their null counts
null_cols = df.columns[~df.notnull().all()]
df[null_cols].isnull().sum()
current_health_expenditure_of_gdp 22 passengers_carried_by_population 88 gdp_per_capita_constant_2010_us 22 population_density_people_per_sq_km_of_land_area 22 country_region 123 mobility_index 123 sum_new_tests 44 min_new_tests 44 containment_health_index 23 democracy_index 11 continent 110 dtype: int64
# Report which countries are missing data in each incomplete column
for col in null_cols:
    countries = df.loc[df[col].isnull(), 'country'].unique()
    print(f"Column: {col}. Countries missing: {countries}")
Column: current_health_expenditure_of_gdp. Countries missing: ['Albania' 'Montenegro'] Column: passengers_carried_by_population. Countries missing: ['Armenia' 'Bosnia and Herzegovina' 'Denmark' 'Dominican Republic' 'Norway' 'Slovakia' 'Sweden' 'Uruguay'] Column: gdp_per_capita_constant_2010_us. Countries missing: ['Iran' 'Venezuela'] Column: population_density_people_per_sq_km_of_land_area. Countries missing: ['Georgia' 'Moldova'] Column: country_region. Countries missing: ['Albania' 'Algeria' 'Armenia' 'Azerbaijan' 'China' 'Czech Republic' 'Ethiopia' 'Georgia' 'Iceland' 'Iran' 'Montenegro' 'Serbia' 'Suriname'] Column: mobility_index. Countries missing: ['Albania' 'Algeria' 'Armenia' 'Azerbaijan' 'China' 'Czech Republic' 'Ethiopia' 'Georgia' 'Iceland' 'Iran' 'Montenegro' 'Serbia' 'Suriname'] Column: sum_new_tests. Countries missing: ['Albania' 'Armenia' 'Azerbaijan' 'Bangladesh' 'Bosnia and Herzegovina' 'Bulgaria' 'Burkina Faso' 'Colombia' 'Costa Rica' 'Czech Republic' 'Dominican Republic' 'El Salvador' 'Ethiopia' 'Ghana' 'Hungary' 'Indonesia' 'Jamaica' 'Jordan' 'Moldova' 'Montenegro' 'Panama' 'Peru' 'Poland' 'Saudi Arabia' 'Slovakia' 'Slovenia' 'Suriname' 'Tanzania' 'Trinidad and Tobago' 'Turkey' 'Ukraine' 'Uruguay' 'Venezuela' 'Zambia'] Column: min_new_tests. Countries missing: ['Albania' 'Armenia' 'Azerbaijan' 'Bangladesh' 'Bosnia and Herzegovina' 'Bulgaria' 'Burkina Faso' 'Colombia' 'Costa Rica' 'Czech Republic' 'Dominican Republic' 'El Salvador' 'Ethiopia' 'Ghana' 'Hungary' 'Indonesia' 'Jamaica' 'Jordan' 'Moldova' 'Montenegro' 'Panama' 'Peru' 'Poland' 'Saudi Arabia' 'Slovakia' 'Slovenia' 'Suriname' 'Tanzania' 'Trinidad and Tobago' 'Turkey' 'Ukraine' 'Uruguay' 'Venezuela' 'Zambia'] Column: containment_health_index. Countries missing: ['Armenia' 'Montenegro' 'Netherlands'] Column: democracy_index. Countries missing: ['Slovakia'] Column: continent. 
Countries missing: ['Canada' 'Costa Rica' 'Dominican Republic' 'El Salvador' 'Guatemala' 'Jamaica' 'Mexico' 'Panama' 'Trinidad and Tobago' 'United States']
# Countries with unrecoverable gaps in key covariates are removed outright
countries_drop = ['Albania', 'Montenegro', 'Puerto Rico', 'Iran', 'Venezuela', 'Armenia', 'Montenegro', 'Netherlands', 'Slovakia']
df = df[~df.country.isin(countries_drop)].copy()
# Re-check what missingness remains per column
for col in null_cols:
    countries = df.loc[df[col].isnull(), 'country'].unique()
    print(f"Column: {col}. Countries missing: {countries}")
Column: current_health_expenditure_of_gdp. Countries missing: [] Column: passengers_carried_by_population. Countries missing: ['Bosnia and Herzegovina' 'Denmark' 'Dominican Republic' 'Norway' 'Sweden' 'Uruguay'] Column: gdp_per_capita_constant_2010_us. Countries missing: [] Column: population_density_people_per_sq_km_of_land_area. Countries missing: ['Georgia' 'Moldova'] Column: country_region. Countries missing: ['Algeria' 'Azerbaijan' 'China' 'Czech Republic' 'Ethiopia' 'Georgia' 'Iceland' 'Serbia' 'Suriname'] Column: mobility_index. Countries missing: ['Algeria' 'Azerbaijan' 'China' 'Czech Republic' 'Ethiopia' 'Georgia' 'Iceland' 'Serbia' 'Suriname'] Column: sum_new_tests. Countries missing: ['Azerbaijan' 'Bangladesh' 'Bosnia and Herzegovina' 'Bulgaria' 'Burkina Faso' 'Colombia' 'Costa Rica' 'Czech Republic' 'Dominican Republic' 'El Salvador' 'Ethiopia' 'Ghana' 'Hungary' 'Indonesia' 'Jamaica' 'Jordan' 'Moldova' 'Panama' 'Peru' 'Poland' 'Saudi Arabia' 'Slovenia' 'Suriname' 'Tanzania' 'Trinidad and Tobago' 'Turkey' 'Ukraine' 'Uruguay' 'Zambia'] Column: min_new_tests. Countries missing: ['Azerbaijan' 'Bangladesh' 'Bosnia and Herzegovina' 'Bulgaria' 'Burkina Faso' 'Colombia' 'Costa Rica' 'Czech Republic' 'Dominican Republic' 'El Salvador' 'Ethiopia' 'Ghana' 'Hungary' 'Indonesia' 'Jamaica' 'Jordan' 'Moldova' 'Panama' 'Peru' 'Poland' 'Saudi Arabia' 'Slovenia' 'Suriname' 'Tanzania' 'Trinidad and Tobago' 'Turkey' 'Ukraine' 'Uruguay' 'Zambia'] Column: containment_health_index. Countries missing: [] Column: democracy_index. Countries missing: [] Column: continent. Countries missing: ['Canada' 'Costa Rica' 'Dominican Republic' 'El Salvador' 'Guatemala' 'Jamaica' 'Mexico' 'Panama' 'Trinidad and Tobago' 'United States']
# Drop columns whose coverage is too sparse to use as covariates
exclude_cols = ['sum_new_tests', 'min_new_tests', 'passengers_carried_by_population', 'country_region']
df = df.drop(exclude_cols, axis = 1).copy()
# Remaining missing continent codes (North/Central American countries)
df['continent'].isnull().sum()
110
df['continent'] = df['continent'].map(lambda x: 'N.A.' if pd.isnull(x) else 'SA' if x == 'Trinidad and Tobago' else x)
# Re-scan every still-incomplete column for its missing countries
for col in df.columns[~df.notnull().all()]:
    countries = df.loc[df[col].isnull(), 'country'].unique()
    print(f"Column: {col}. Countries missing: {countries}")
Column: population_density_people_per_sq_km_of_land_area. Countries missing: ['Georgia' 'Moldova'] Column: mobility_index. Countries missing: ['Algeria' 'Azerbaijan' 'China' 'Czech Republic' 'Ethiopia' 'Georgia' 'Iceland' 'Serbia' 'Suriname']
# Drop the two countries lacking population-density data, then re-check
df = df[~df.country.isin(['Moldova', 'Georgia'])].copy()
for col in df.columns[~df.notnull().all()]:
    countries = df.loc[df[col].isnull(), 'country'].unique()
    print(f"Column: {col}. Countries missing: {countries}")
Column: mobility_index. Countries missing: ['Algeria' 'Azerbaijan' 'China' 'Czech Republic' 'Ethiopia' 'Iceland' 'Serbia' 'Suriname']
# Finally drop the countries without a mobility index; the frame should
# then be fully complete (the loop prints nothing).
drop_countries = ['Algeria', 'Azerbaijan', 'China', 'Czech Republic', 'Ethiopia', 'Iceland', 'Serbia', 'Suriname']
df = df[~df.country.isin(drop_countries)].copy()
for col in df.columns[~df.notnull().all()]:
    countries = df.loc[df[col].isnull(), 'country'].unique()
    print(f"Column: {col}. Countries missing: {countries}")
# Neighbor-cases feature: rebuild the case frame from scratch
from countryinfo import CountryInfo
import pycountry
cases_df = get_cases(conn)
cases_df = calculate_capita_metrics(cases_df)
cases_df = get_daily_growth(cases_df)
cases_df = daily_growth_check(cases_df, drop_countries = True)
A total of 30 entries are excluded from the initial dataset since they do not have data for Population. These are: Mayotte, Other continent, Saint Helena, Oceania, Pitcairn Islands, Martinique, Reunion, Wallis and Futuna, Guernsey, Montserrat, Cook Islands, Kosovo, Asia, French Guiana, Bonaire, Sint Eustatius and Saba, Tokelau, Saint-Barthélemy, America, Saint Pierre and Miquelon, Falkland Islands, Taiwan, Niue, Anguilla, World, Guadeloupe, Africa, Western Sahara, Vatican City, Europe, Jersey There are 64 cases of negative daily growth in cases from the data Dropped 4 countries from further analysis: Benin, Ecuador, Puerto Rico, São Tomé and Príncipe.
def get_month_agg(df):
    """Aggregate the daily case data to (country, month) level.

    Keeps the month-end cumulative count ('confirmed': max) and the summed
    daily per-100K new cases for each country-month.
    """
    df['month'] = pd.to_datetime(df.date).dt.month
    monthly = (df.groupby(['country', 'month'])
                 .agg({'confirmed': 'max',
                       'new_cases_per_100K': 'sum'})
                 .reset_index())
    return monthly
cases_df = get_month_agg(cases_df)
def get_neighbors_all(df):
    """Add a 'cases_neighbors' column: for each country, the monthly mean
    new_cases_per_100K averaged over its land neighbours.

    Borders come from CountryInfo (with manual corrections for countries
    the package gets wrong); pycountry maps alpha-3 codes back to names,
    which are then normalised to the names used in this dataset. Countries
    unknown to CountryInfo, or with no borders, are skipped and keep NaN.

    Args:
        df (dataframe): monthly aggregates with 'country', 'month' and
            'new_cases_per_100K' columns

    Returns:
        df (dataframe): same frame, mutated in place, with 'cases_neighbors'
    """
    # Manual corrections for borders the package reports incorrectly
    countries_correct = {'Albania': ['MNE', 'GRC', 'MKD'],
                         'Serbia': ['HUN', 'ROU', 'BGR', 'MKD', 'HRV', 'BIH', 'MNE', 'ALB'],
                         'South Korea': []}
    # pycountry official names -> the names this dataset uses
    change_names = {'Russian Federation': 'Russia',
                    'Iran, Islamic Republic of': 'Iran',
                    'Bolivia, Plurinational State of': 'Bolivia',
                    'Czechia': 'Czech Republic',
                    'Tanzania, United Republic of': 'Tanzania',
                    'Viet Nam': 'Vietnam'
                    }
    for country in df.country.unique():
        print(f"Country: {country}")
        # Get neighbors (alpha-3 codes); skip countries the package doesn't know
        try:
            countries_short = CountryInfo(country).borders()
        except KeyError:
            print(f"Passing country {country}")
            continue
        # Apply manual corrections where the package data is wrong
        if country in countries_correct:
            countries_short = countries_correct[country]
        # Map alpha-3 codes to full names, then to the dataset's naming
        bordering_countries = [pycountry.countries.get(alpha_3=code).name for code in countries_short]
        bordering_countries = [change_names.get(name, name) for name in bordering_countries]
        # Warn about neighbours that are absent from the dataset
        unique_after = df[df.country.isin(bordering_countries)].country.unique()
        if len(unique_after) != len(bordering_countries):
            set_diff = set(bordering_countries).difference(set(unique_after))
            print(f"Country {country} missing: {set_diff}")
        # Skip countries with no land borders (islands, corrected South Korea)
        if len(bordering_countries) == 0:
            continue
        # Mean neighbour cases per month.
        # BUG FIX: this was previously computed a second time BEFORE the
        # empty-border check above — redundant work with identical result.
        country_avg_cases = df.loc[df.country.isin(bordering_countries)].groupby(['month']).new_cases_per_100K.mean().reset_index()
        # Re-index the per-month averages onto the country's own row index
        # so .loc assignment aligns (assumes one row per month per country)
        indices = df.loc[(df.country == country)].index
        values_to_set = country_avg_cases.set_index(indices)['new_cases_per_100K']
        df.loc[(df.country == country), 'cases_neighbors'] = values_to_set
    return df
cases_df = get_neighbors_all(cases_df)
Country: Afghanistan
Country: Albania
Country Albania missing: {'North Macedonia'}
Country: Algeria
Country Algeria missing: {'Western Sahara'}
Country: American Samoa
Country: Andorra
Passing country Andorra
Country: Angola
Country Angola missing: {'Congo, The Democratic Republic of the', 'Namibia', 'Congo'}
Country: Antigua and Barbuda
Country: Argentina
Country: Armenia
Country: Aruba
Country: Australia
Country: Austria
Country: Azerbaijan
Country: Bahamas
Passing country Bahamas
Country: Bahrain
Country: Bangladesh
Country: Barbados
Country: Belarus
Country: Belgium
Country: Belize
Country: Bermuda
Country: Bhutan
Country: Bolivia
Country: Bosnia and Herzegovina
Country: Botswana
Country Botswana missing: {'Namibia'}
Country: Brazil
Country Brazil missing: {'French Guiana', 'Venezuela, Bolivarian Republic of'}
Country: British Virgin Islands
Passing country British Virgin Islands
Country: Brunei
Country: Bulgaria
Country Bulgaria missing: {'North Macedonia'}
Country: Burkina Faso
Country Burkina Faso missing: {'Benin'}
Country: Burundi
Country Burundi missing: {'Congo, The Democratic Republic of the'}
Country: Cambodia
Country Cambodia missing: {"Lao People's Democratic Republic"}
Country: Cameroon
Country Cameroon missing: {'Congo'}
Country: Canada
Country: Cape Verde
Country: Cayman Islands
Country: Central African Republic
Country Central African Republic missing: {'Congo, The Democratic Republic of the', 'Congo'}
Country: Chad
Country: Chile
Country: China
Country China missing: {"Lao People's Democratic Republic", 'Hong Kong', "Korea, Democratic People's Republic of", 'Macao'}
Country: Colombia
Country Colombia missing: {'Ecuador', 'Venezuela, Bolivarian Republic of'}
Country: Comoros
Country: Costa Rica
Country: Croatia
Country: Cuba
Country: Cyprus
Country: Czech Republic
Country: Côte d'Ivoire
Passing country Côte d'Ivoire
Country: Democratic Republic of the Congo
Country Democratic Republic of the Congo missing: {'Congo'}
Country: Denmark
Country: Djibouti
Country Djibouti missing: {'Eritrea'}
Country: Dominica
Country: Dominican Republic
Country: Egypt
Country: El Salvador
Country: Equatorial Guinea
Country: Estonia
Country: Ethiopia
Country Ethiopia missing: {'Eritrea'}
Country: Faroe Islands
Country: Fiji
Country: Finland
Country: France
Country: French Polynesia
Country: Gabon
Country Gabon missing: {'Congo'}
Country: Gambia
Passing country Gambia
Country: Georgia
Country: Germany
Country: Ghana
Country: Gibraltar
Country: Greece
Country Greece missing: {'North Macedonia'}
Country: Greenland
Country: Grenada
Country: Guam
Country: Guatemala
Country: Guinea
Country: Guinea-Bissau
Country: Guyana
Country Guyana missing: {'Venezuela, Bolivarian Republic of'}
Country: Haiti
Country: Honduras
Country: Hungary
Country: Iceland
Country: India
Country: Indonesia
Country: Iran
Country: Iraq
Country Iraq missing: {'Syrian Arab Republic'}
Country: Ireland
Country: Isle of Man
Country: Israel
Country Israel missing: {'Syrian Arab Republic'}
Country: Italy
Country Italy missing: {'Holy See (Vatican City State)'}
Country: Jamaica
Country: Japan
Country: Jordan
Country Jordan missing: {'Syrian Arab Republic'}
Country: Kazakhstan
Country: Kenya
Country: Kiribati
Country: Kuwait
Country: Kyrgyzstan
Country: Laos
Country: Latvia
Country: Lebanon
Country Lebanon missing: {'Syrian Arab Republic'}
Country: Lesotho
Country: Liberia
Country: Libya
Country: Liechtenstein
Country: Lithuania
Country: Luxembourg
Country: Macedonia
Passing country Macedonia
Country: Madagascar
Country: Malawi
Country: Malaysia
Country Malaysia missing: {'Brunei Darussalam'}
Country: Maldives
Country: Mali
Country: Malta
Country: Marshall Islands
Country: Mauritania
Country Mauritania missing: {'Western Sahara'}
Country: Mauritius
Country: Mexico
Country: Micronesia
Passing country Micronesia
Country: Moldova
Country: Monaco
Country: Mongolia
Country: Montenegro
Passing country Montenegro
Country: Morocco
Country Morocco missing: {'Western Sahara'}
Country: Mozambique
Country Mozambique missing: {'Eswatini'}
Country: Myanmar
Passing country Myanmar
Country: Nauru
Country: Nepal
Country: Netherlands
Country: New Caledonia
Country: New Zealand
Country: Nicaragua
Country: Niger
Country Niger missing: {'Benin'}
Country: Nigeria
Country Nigeria missing: {'Benin'}
Country: North Korea
Country North Korea missing: {'Korea, Republic of'}
Country: Northern Mariana Islands
Country: Norway
Country: Oman
Country: Pakistan
Country: Palau
Country: Palestina
Passing country Palestina
Country: Panama
Country: Papua New Guinea
Country: Paraguay
Country: Peru
Country Peru missing: {'Ecuador'}
Country: Philippines
Country: Poland
Country: Portugal
Country: Qatar
Country: Republic of Congo
Passing country Republic of Congo
Country: Romania
Country Romania missing: {'Moldova, Republic of'}
Country: Russia
Country Russia missing: {"Korea, Democratic People's Republic of"}
Country: Rwanda
Country Rwanda missing: {'Congo, The Democratic Republic of the'}
Country: Saint Kitts and Nevis
Country: Saint Lucia
Country: Saint Vincent and the Grenadines
Country: Saint-Martin
Passing country Saint-Martin
Country: Samoa
Country: San Marino
Country: Saudi Arabia
Country: Senegal
Country: Serbia
Country Serbia missing: {'North Macedonia'}
Country: Seychelles
Country: Sierra Leone
Country: Singapore
Country: Slovakia
Country: Slovenia
Country: Solomon Islands
Country: Somalia
Country: South Africa
Country South Africa missing: {'Eswatini', 'Namibia'}
Country: South Korea
Country: South Sudan
Country South Sudan missing: {'Congo, The Democratic Republic of the'}
Country: Spain
Country: Sri Lanka
Country: Sudan
Country Sudan missing: {'Eritrea'}
Country: Suriname
Country Suriname missing: {'French Guiana'}
Country: Swaziland
Country: Sweden
Country: Switzerland
Country: Syria
Country: Tajikistan
Country: Tanzania
Country Tanzania missing: {'Congo, The Democratic Republic of the'}
Country: Thailand
Country Thailand missing: {"Lao People's Democratic Republic"}
Country: Timor-Leste
Passing country Timor-Leste
Country: Togo
Country Togo missing: {'Benin'}
Country: Tonga
Country: Trinidad and Tobago
Country: Tunisia
Country: Turkey
Country Turkey missing: {'Syrian Arab Republic'}
Country: Turkmenistan
Country: Turks and Caicos Islands
Passing country Turks and Caicos Islands
Country: Tuvalu
Country: Uganda
Country Uganda missing: {'Congo, The Democratic Republic of the'}
Country: Ukraine
Country Ukraine missing: {'Moldova, Republic of'}
Country: United Arab Emirates
Country: United Kingdom
Country: United States
Country: Uruguay
Country: Uzbekistan
Country: Vanuatu
Country: Venezuela
Country: Vietnam
Country Vietnam missing: {"Lao People's Democratic Republic"}
Country: Virgin Islands, U.S.
Passing country Virgin Islands, U.S.
Country: Yemen
Country: Zambia
Country Zambia missing: {'Congo, The Democratic Republic of the', 'Namibia'}
Country: Zimbabwe
cases_df[cases_df.country.isin(['Russia', 'Belarus', 'Latvia', 'Poland']) & (cases_df.month == 6)]['new_cases_per_100K'].mean()
102.71813229329884
cases_df[cases_df.country == 'Lithuania']
| country | month | confirmed | new_cases_per_100K | cases_neighbors | |
|---|---|---|---|---|---|
| 1248 | Lithuania | 1 | 146637 | 217.378511 | 81.953512 |
| 1249 | Lithuania | 2 | 1 | 0.035883 | 0.002641 |
| 1250 | Lithuania | 3 | 484 | 17.331433 | 7.070394 |
| 1251 | Lithuania | 4 | 1449 | 31.971650 | 65.594832 |
| 1252 | Lithuania | 5 | 1670 | 10.585451 | 137.057066 |
| 1253 | Lithuania | 6 | 1816 | 5.238901 | 102.718132 |
| 1254 | Lithuania | 7 | 2062 | 8.827189 | 57.774407 |
| 1255 | Lithuania | 8 | 2874 | 29.136902 | 53.677582 |
| 1256 | Lithuania | 9 | 4578 | 61.144434 | 68.289774 |
| 1257 | Lithuania | 10 | 13823 | 331.737263 | 344.114493 |
| 1258 | Lithuania | 11 | 61325 | 1704.508756 | 788.182962 |
| 1259 | Lithuania | 12 | 140579 | 2843.862089 | 793.408555 |
df = pd.merge(df, cases_df[['country', 'month', 'cases_neighbors']], left_on = ['country', 'month'], right_on = ['country', 'month'], how = 'left')
countries_no_neighbors = df[df.cases_neighbors.isnull()]['country'].unique()
print(countries_no_neighbors)
['Australia' 'Jamaica' 'Japan' 'Malta' 'New Zealand' 'Philippines' 'Singapore' 'South Korea' 'Trinidad and Tobago']
These countries have no land neighbours represented in the data, so we fall back to the monthly continent average for the neighbour-cases metric.
# Continent-month averages used as a fallback neighbour metric
continent_avgs = df.groupby(['continent', 'month']).new_cases_per_100k.mean().reset_index()
for c in countries_no_neighbors:
    continent = df.loc[df.country == c, 'continent'].iloc[0]
    # Align the continent averages to the country's own row index before assigning
    ind = df.loc[df.country == c, 'cases_neighbors'].index
    set_values = continent_avgs.loc[continent_avgs.continent == continent, ['new_cases_per_100k']].set_index(ind)
    df.loc[df.country == c, 'cases_neighbors'] = set_values['new_cases_per_100k']
# Confirm every country now has the neighbour metric
countries_no_neighbors_updated = df[df.cases_neighbors.isnull()]['country'].unique()
print(countries_no_neighbors_updated)
[]
# Shorten the unwieldy World Bank column names
df.rename(columns={'idv': 'individualism_index',
                   'current_health_expenditure_of_gdp' :'health_expenditure',
                   'gdp_per_capita_constant_2010_us': 'gdp_per_capita',
                   'population_density_people_per_sq_km_of_land_area': 'population_density',
                   }, inplace = True)
# Log-transform right-skewed covariates
for col in ['health_expenditure', 'distance', 'gdp_per_capita']:
    df[col+'_log'] = np.log(df[col])
# Adding new changes to the model
def get_interactions(df, month_12=False):
    """Create interaction terms between individualism and the government /
    mobility indices. Uses the per-country averaged indices when month_12
    is True (the cross-sectional dataset); the monthly indices otherwise.
    Mutates and returns *df*.
    """
    if month_12:
        gov_col, mob_col = 'containment_health_index_avg', 'mobility_index_avg'
    else:
        gov_col, mob_col = 'containment_health_index', 'mobility_index'
    df['individualism_gov'] = df['individualism_index'] * df[gov_col]
    df['individualism_mob'] = df['individualism_index'] * df[mob_col]
    return df
# Add interaction terms, a one-month lag of neighbour cases, then persist
df = get_interactions(df)
# NOTE(review): shift(1) crosses country boundaries at each country's first
# row; since the panel starts at month 2 and month-2 lags are zeroed below,
# this appears safe — confirm rows are ordered (country, month).
df['cases_neighbors_lag'] = df['cases_neighbors'].shift(1)
df.loc[df.month == 2, 'cases_neighbors_lag'] = 0
df.to_csv("2021-01-04 final_data.csv", index=False)
# Reload the persisted dataset so later cells can run without re-deriving it
df = pd.read_csv('2021-01-04 final_data.csv')
df['country'].nunique()
72
# Variables included in the descriptive-statistics table
vars_to_use = ['individualism_index', 'gdp_per_capita',
               'population_density',
               'mobility_index', 'containment_health_index', 'democracy_index', 'distance_log', 'continent', 'individualism_gov',
               'cases_neighbors']
df2 = df[vars_to_use]
# Short aliases so the LaTeX table fits the page width
df2.columns = ['ind', 'gdp', 'popdens', 'mob', 'containm', 'dem', 'dist', 'cont', 'ind_gov', 'neighb']
with open('summary_table.tex','w') as tf:
    tf.write(df2.describe().round(2).to_latex())
# Anonymised labels X_1..X_10 for an alternative table layout
df2.columns = [f'X_{n}' for n in range(1,11)]
def get_individualism_scores(df=df, remove_last_plot=True, export_to_png=False, title=True):
    """Plot monthly regressions of individualism score vs log new cases.

    One subplot per month (Feb-Dec) in a 3x4 grid; the unused bottom-right
    axis is removed. Each title shows the Pearson correlation and p-value.
    NOTE(review): the title labels the statistic $R^2$ but `corr` is the
    Pearson r, not r squared — confirm the intended label.

    Args:
        df: modelling frame; uses 'month', 'new_cases_per_100k',
            'new_cases_per_100k_log' and 'individualism_index'.
            (Default binds the module-level df at definition time.)
        remove_last_plot: delete the empty 12th axis.
        export_to_png: save the figure to 'Individualism_newcases4.png'.
        title: add a figure-level suptitle.
    """
    fig, axs = plt.subplots(3,4, figsize = (20,8))
    # Flatten the 3x4 axes grid so subplots are consumed in order
    ax_list = [item for sublist in axs for item in sublist]
    for month in range(2,13):
        ax = ax_list.pop(0)
        # This month's rows with nonzero new cases and a known idv score
        df_ = df[df['month'] == month]
        df_ = df_[df_['new_cases_per_100k'] != 0]
        df_ = df_[df_['individualism_index'].notnull()]
        sns.regplot(x = 'new_cases_per_100k_log', y = 'individualism_index', data = df_, ax=ax)
        corr, p = pearsonr(df_['new_cases_per_100k_log'], df_['individualism_index'])
        ax.set_title(f"{calendar.month_name[month]}, $R^2$: {corr.round(2)}, p = {p.round(3)}", fontsize = 14)
        ax.set_ylim([0,100])
        # Axis labels only on the leftmost column / bottom row
        if month in (2,6,10): ax.set_ylabel("Individualism score")
        else: ax.set_ylabel("")
        if month in (9,10,11,12): ax.set_xlabel("New cases per 100K people, log")
        else: ax.set_xlabel("")
    if remove_last_plot:
        fig.delaxes(axs[2,3])
    if title:
        fig.suptitle("Relationship between individualism and new monthly COVID-19 cases", fontsize = 24)
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    if export_to_png:
        fig.savefig('Individualism_newcases4.png')
# Export the figure without a suptitle
get_individualism_scores(export_to_png=True, title=False)
df.columns
Index(['country', 'month', 'individualism_index', 'confirmed', 'dead',
'population', 'health_expenditure', 'gdp_per_capita',
'population_ages_65_and_above_total', 'population_density',
'over65_per_capita', 'population_total', 'new_cases_per_100k',
'new_cases_total', 'new_deaths_total', 'new_cases_total_log',
'new_cases_per_100k_log', 'new_deaths_total_log', 'mobility_index',
'containment_health_index', 'democracy_index', 'capital', 'distance',
'continent', 'confirmed_log', 'cases_neighbors', 'distance_log',
'individualism_gov', 'individualism_mob', 'health_expenditure_log',
'gdp_per_capita_log', 'gdp'],
dtype='object')
month = 12
def label_point(x, y, val, ax):
    """Annotate selected countries next to their (x, y) points on *ax*.

    Only Australia, New Zealand and Vietnam are labelled; the text is
    offset 0.25 to the right of the data point.
    """
    highlight = ['Australia', 'New Zealand', 'Vietnam']
    points = pd.concat({'x': x, 'y': y, 'val': val}, axis=1)
    for _, row in points.iterrows():
        if row['val'] in highlight:
            ax.text(row['x'] + .25, row['y'], str(row['val']))
# December scatter of individualism vs log new cases, fitted twice:
# once on all data, once excluding Australia / New Zealand / Vietnam
fig, ax = plt.subplots(1,1,figsize=(8,7))
df_ = df[df['month'] == month]
df_ = df_[df_['new_cases_per_100k'] != 0]
df_ = df_[df_['individualism_index'].notnull()]
df_2 = df_[~df_.country.isin(['Australia', 'New Zealand', 'Vietnam'])]
# Overlay both regression lines on the same axes
f1 = sns.regplot(x = 'new_cases_per_100k_log', y = 'individualism_index', data = df_, ax=ax)
f2 = sns.regplot(x = 'new_cases_per_100k_log', y = 'individualism_index', data = df_2, ax=ax,
                 scatter_kws = {'color': 'midnightblue'}, line_kws = {'color': 'midnightblue'})
# Report both correlations (with and without the excluded countries)
corr, p = pearsonr(df_['new_cases_per_100k_log'], df_['individualism_index'])
print(corr,p)
corr2, p2 = pearsonr(df_2['new_cases_per_100k_log'], df_2['individualism_index'])
print(corr2,p2)
#ax.set_title(f"{calendar.month_name[month]}, $R^2$: {corr.round(2)}, p = {p.round(3)}")
blue_patch = mpatches.Patch(color='tab:blue', label='Line of best fit, all data')
midnight_patch = mpatches.Patch(color='midnightblue', label='Line of best fit, selected data')
#plt.legend(handles=[red_patch, midnight_patch], bbox_to_anchor = (1.01, 1), loc = 'upper left', frameon=True)
plt.legend(handles=[blue_patch, midnight_patch], loc = 'lower right', frameon=False)
# Label the three excluded countries on the plot
label_point(df_.new_cases_per_100k_log, df_.individualism_index, df_.country, plt.gca())
fig.savefig('month_12.png')
0.31727538808414757 0.0070178711833126855 0.5207701239060172 5.298551251561445e-06
# Cumulative confirmed cases per 100K and its log
# (tiny epsilon avoids log(0) for countries with zero confirmed cases)
df['confirmed_per_100k'] = (df['confirmed'] / df['population']) * 100_000
df['confirmed_per_100k_log'] = np.log(df['confirmed_per_100k'] + 0.0000000000001)
def get_individualism_scores(df=df, remove_last_plot=True, export_to_png=False, title=True, cumulative=False):
    """Plot monthly regressions of individualism vs new or cumulative cases.

    Redefines the earlier function with a `cumulative` switch: when True,
    the x variable is log cumulative confirmed per 100K; otherwise log new
    monthly cases.
    NOTE(review): the local variable `individualism` actually holds the x
    (cases) column name, and the titles label Pearson r as $R^2$ — both
    look like naming slips to confirm.

    Args:
        df: modelling frame (default binds the module-level df at def time).
        remove_last_plot: delete the empty bottom-right axis.
        export_to_png: save to 'Individualism_newcases3.png' or
            'Individualism_cumulativecases3.png' depending on `cumulative`.
        title: add a suptitle (new-cases variant only).
        cumulative: plot cumulative instead of new cases on the x axis.
    """
    # Select the x-axis column
    if cumulative: individualism='confirmed_per_100k_log'
    else: individualism = 'new_cases_per_100k_log'
    fig, axs = plt.subplots(3,4, figsize = (20,8))
    # Flatten the 3x4 axes grid so subplots are consumed in order
    ax_list = [item for sublist in axs for item in sublist]
    for month in range(2,13):
        ax = ax_list.pop(0)
        # This month's rows with nonzero new cases and a known idv score
        df_ = df[df['month'] == month]
        df_ = df_[df_['new_cases_per_100k'] != 0]
        df_ = df_[df_['individualism_index'].notnull()]
        sns.regplot(x = individualism, y = 'individualism_index', data = df_, ax=ax)
        corr, p = pearsonr(df_[individualism], df_['individualism_index'])
        ax.set_title(f"{calendar.month_name[month]}, $R^2$: {corr.round(2)}, p = {p.round(3)}")
        ax.set_ylim([0,100])
        # Axis labels only on the leftmost column / bottom row
        if month in (2,6,10): ax.set_ylabel("Individualism score")
        else: ax.set_ylabel("")
        if not cumulative:
            if month in (10,11,12): ax.set_xlabel("New cases per 100K people, log")
            else: ax.set_xlabel("")
        else:
            if month in (9,10,11,12): ax.set_xlabel("Cumulative cases per 100K people, log")
            else: ax.set_xlabel("")
    if remove_last_plot:
        fig.delaxes(axs[2,3])
    # Suptitle only applies to the new-cases variant
    if not cumulative:
        if title:
            fig.suptitle("Relationship between individualism and new monthly COVID-19 cases", fontsize = 16)
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    # Save under a variant-specific filename
    if not cumulative:
        if export_to_png:
            fig.savefig('Individualism_newcases3.png')
    else:
        if export_to_png:
            fig.savefig('Individualism_cumulativecases3.png')
# Export both variants without suptitles
get_individualism_scores(title=False, cumulative=True, export_to_png=True)
get_individualism_scores(title=False, cumulative=False, export_to_png=True)
# Covariates to inspect against individualism
vars_to_use = ['individualism_index', 'gdp_per_capita',
               'population_density',
               'mobility_index', 'containment_health_index', 'democracy_index', 'distance_log', 'continent', 'individualism_gov',
               'cases_neighbors']
#sns.regplot(x=vars_to_use, y='individualism_index', data=df_newest)
# Pairwise scatter of each covariate vs individualism
# (df_newest is the month-12 cross-section built later in the notebook)
sns.pairplot(df_newest[vars_to_use], y_vars='individualism_index')
<seaborn.axisgrid.PairGrid at 0x1efd73afdc0>
sns.pairplot(df[vars_to_use], hue = 'continent')
<seaborn.axisgrid.PairGrid at 0x1efc2fd58e0>
# Establish connection
# (public read-only credentials for the OxCOVID19 postgres database)
conn = psycopg2.connect(
    host='covid19db.org',
    port=5432,
    dbname='covid19',
    user='covid19',
    password='covid19')
cur = conn.cursor()
def get_gov_data(threshold=0.8):
    """Pull government-response rows from the postgres database and report
    country names that look like near-duplicates of one another.

    Args:
        threshold (float): similarity ratio above which two lowercased
            country names are printed as potential duplicate entries

    Returns:
        gov_df (dataframe): government response rows from the GOVTRACK source
    """
    # Extract the data
    sql_command = """
    SELECT date, country,c6_Stay_at_home_requirements, c6_Flag
    FROM government_response
    WHERE source = 'GOVTRACK'
    """
    gov_df = pd.read_sql(sql_command, conn)
    gov_df['date'] = pd.to_datetime(gov_df['date'])
    # Compare every pair of lowercased country names for near matches
    lowered_names = [name.lower() for name in gov_df['country'].unique()]
    print("Countries with similar strings: ")
    for c1, c2 in itertools.combinations(lowered_names, 2):
        if SequenceMatcher(None, c1, c2).ratio() > threshold:
            print(f"Country 1: {c1} Country 2: {c2}")
    return gov_df
def get_stay_at_home_metrics(gov_df):
    """Add stay-at-home timing metrics to the government-response frame.

    Finds the first day each country imposed stay-at-home requirements of
    strength 2 or 3 (OxCOVID C6 index) and derives, for every row:
    - 'days_after': days relative to that first day (negative before it)
    - 'weeks_after': the same in 7-day blocks (week 1 = first 7 days after,
      regardless of calendar weekday; negative weeks before)
    Countries that never reach level 2/3 are dropped. Note that later
    relaxations of the requirement are NOT taken into account.

    Args:
        gov_df (dataframe): Dataframe containing government response information

    Returns:
        gov_df (dataframe): new frame restricted to countries with national
            requirements, with 'days_after' and 'weeks_after' added
    """
    # Countries that ever reach stay-at-home level 2 or 3
    countries_with_req = gov_df.loc[(gov_df['c6_stay_at_home_requirements'].isin([2,3])), 'country'].unique()
    gov_df = gov_df[gov_df['country'].isin(countries_with_req)].copy()
    # Sort by (country, date) and re-index 0..n-1 so positional index
    # arithmetic below is valid (each country occupies a contiguous range)
    gov_df.sort_values(['country', 'date'], inplace = True)
    gov_df.index = pd.RangeIndex(len(gov_df))
    # Loop over all the countries
    for country in countries_with_req:
        # Boolean mask and first row index for this country
        gov_filter = gov_df.country == country
        first_country_filter = gov_df.loc[gov_filter].index.min()
        # Index of the first day the requirement reached level 2/3
        # (idxmax on a boolean series returns the first True position)
        home_id_any_level = gov_df.loc[gov_filter, 'c6_stay_at_home_requirements'].isin([2,3]).idxmax()
        #c5_flag = (gov_df.loc[gov_filter, 'c6_flag'] == 1).idxmax()
        home_id = home_id_any_level
        # Masks for rows before / on-or-after the introduction day
        days_before_filter = gov_df.index < home_id
        days_after_filter = gov_df.index >= home_id
        # Days before run from -(home_id - first_row) up to -1;
        # days on/after count 0, 1, 2, ...
        gov_df.loc[gov_filter & days_before_filter, 'days_after'] = range(first_country_filter-home_id, 0, 1)
        gov_df.loc[gov_filter & days_after_filter, 'days_after'] = range(0, np.sum(gov_filter & days_after_filter))
        # ceil for positive days, floor for zero/negative, so week 0 spans
        # only day 0 and weeks are symmetric 7-day blocks around it
        gov_df.loc[gov_filter, 'weeks_after'] = gov_df.loc[gov_filter, 'days_after'].map(
            lambda day: math.ceil(day/7) if day > 0 else math.floor(day/7))
    return gov_df
gov_df = get_gov_data()
# Excluding "Viet Nam" since the country is entered twice (also as "Vietnam")
gov_df = gov_df[gov_df['country'] != 'Viet Nam']
# Extracting additional stay-at-home timing metrics for plotting
gov_df = get_stay_at_home_metrics(gov_df)
Countries with similar strings: Country 1: iceland Country 2: ireland Country 1: australia Country 2: austria Country 1: vietnam Country 2: viet nam Country 1: niger Country 2: nigeria Country 1: gambia Country 2: zambia
gov_df[gov_df.country == 'United States'].sort_values('date')
| date | country | c6_stay_at_home_requirements | c6_flag | days_after | weeks_after |
|---|
gov_df[gov_df.country == 'United States']['c6_stay_at_home_requirements'].max()
nan
# Hofstede 6-dimensions dataset; keep only country and individualism (idv)
df2 = pd.read_excel("6-dimensions-for-website-2015-08-16.xls")
df2 = df2[['country', 'idv']]
df2.head()
| country | idv | |
|---|---|---|
| 0 | Africa East | 27.0 |
| 1 | Africa West | 20.0 |
| 2 | Albania | 20.0 |
| 3 | Algeria | 35.0 |
| 4 | Andorra | NaN |
# Attach individualism scores to the government-response data
gov_df = gov_df.merge(df2[['country', 'idv']], left_on='country', right_on='country', how='left')
# How many countries have a known idv score after the merge
gov_df[gov_df.idv.notnull()].country.nunique()
67
# Keep only countries with a known individualism score
gov_df = gov_df[gov_df.idv.notnull()]
# Rebuild the daily cases dataset (helpers defined earlier in the notebook)
cases_df = get_cases(conn)
cases_df = calculate_capita_metrics(cases_df)
cases_df = get_daily_growth(cases_df)
cases_df = daily_growth_check(cases_df, drop_countries = True)
A total of 30 entries are excluded from the initial dataset since they do not have data for Population. These are: Mayotte, Other continent, Saint Helena, Oceania, Pitcairn Islands, Martinique, Reunion, Wallis and Futuna, Guernsey, Montserrat, Cook Islands, Kosovo, Asia, French Guiana, Bonaire, Sint Eustatius and Saba, Tokelau, Saint-Barthélemy, America, Saint Pierre and Miquelon, Falkland Islands, Taiwan, Niue, Anguilla, World, Guadeloupe, Africa, Vatican City, Western Sahara, Europe, Jersey There are 64 cases of negative daily growth in cases from the data Dropped 4 countries from further analysis: Benin, Ecuador, Puerto Rico, São Tomé and Príncipe.
gov_df = gov_df.merge(cases_df[['date', 'country', 'new_cases_per_100K']], on = ['date', 'country'], how = 'left')
def get_categories(v):
    """Bucket a number into 25-wide ranges: 5 -> '0-25', 30 -> '25-50'.

    BUG FIX (docs): the original docstring claimed width-10 buckets
    ('5' becomes '0-10'), but the code has always used width 25 — idv
    scores run 0-100, giving four buckets.

    Args:
        v (float): a number (expected non-negative)

    Returns:
        range_value (string): the enclosing 'low-high' bucket label
    """
    lower = int(v / 25) * 25
    # Same result as the original str/int round-trip, without the round-trip
    range_value = f"{lower}-{lower + 25}"
    return range_value
def get_agg_rural_data(gov_df):
    """Aggregate weekly new cases per 100K by individualism (idv) bucket.

    (The name says "rural" because the function was adapted from an earlier
    rural-share analysis; the grouping here is on 25-wide idv buckets.)

    Args:
        gov_df (dataframe): Dataframe containing government response information
            with 'idv', 'weeks_after' and 'new_cases_per_100K' columns

    Returns:
        df_at_home_agg (dataframe): mean new_cases_per_100K per idv bucket
            (rows) and week relative to the stay-at-home order (columns),
            for weeks -8..15 with week 0 dropped, rows in reversed order
    """
    # Bucket idv scores into 25-wide categories (adds a column in place)
    gov_df['idv_categ'] = gov_df['idv'].map(get_categories)
    # Mean weekly cases per bucket; drop week 0 (the introduction day only)
    df_at_home_agg = pd.pivot_table(gov_df[gov_df['weeks_after'].between(-8, 15)],
                                    index = ['idv_categ'],
                                    columns = 'weeks_after',
                                    values = 'new_cases_per_100K',
                                    aggfunc = 'mean').drop(0, axis = 1)
    # Convert the float week labels to integers
    df_at_home_agg.columns = [int(x) for x in df_at_home_agg.columns]
    # Reverse row order so buckets plot from high to low in the heatmap
    df_at_home_agg = df_at_home_agg.loc[df_at_home_agg.index[::-1]]
    return df_at_home_agg
df_at_home_agg = get_agg_rural_data(gov_df)
# Log scale makes the early-week differences visible in the heatmap
df_at_home_agg_log = np.log(df_at_home_agg)
def plot_cases(df_at_home_agg, set_params_max=True, vmin=0, vmax = 6):
    """Plot a heatmap of mean new weekly cases per 100K by category and week
    relative to the introduction of nation-wide stay-at-home requirements.

    NOTE(review): the y-axis label still says "Rural population" from the
    earlier rural analysis, but the rows are individualism buckets here —
    confirm whether the label should be updated.

    Args:
        df_at_home_agg (dataframe): Aggregated information on information to plot
        set_params_max (bool): clamp the colour scale to [vmin, vmax]
        vmin (float): lower colour bound (only used when set_params_max)
        vmax (float): upper colour bound (only used when set_params_max)

    Returns:
        None
    """
    # Set palette
    cmap = sns.dark_palette("#69d", reverse = True, as_cmap=True)
    # Create subplot
    fig, ax = plt.subplots(1,1, figsize = (15,5))
    fig.suptitle('Figure 8. Number of new weekly cases per 100K people after stay-at-home requirement introduction', fontsize=15)
    # Create heatmap and params
    if set_params_max:
        sns.heatmap(df_at_home_agg, vmin = vmin, vmax = vmax, cmap = cmap, linecolor = 'white', linewidth = 0.01, ax = ax)
    else:
        sns.heatmap(df_at_home_agg, cmap = cmap, linecolor = 'white', linewidth = 0.01, ax = ax)
    # Vertical line at column 8 marks week 0 (weeks -8..-1 precede it).
    # NOTE(review): *ax.get_xlim() is passed as the ymin/ymax of vlines —
    # looks like get_ylim was intended; verify the line spans the plot.
    ax.vlines([8], *ax.get_xlim(), linestyle = 'solid', colors = 'black', label = 'test', lw = 3)
    plt.text(6,0,'Requirements introduced\n', fontdict = {'weight': 100, 'size': 12})
    ax.set_xlabel('Weeks after the introduction of nation-wide stay-at-home requirements')
    ax.set_ylabel('Rural population in the country, %')
    plt.yticks(rotation=0);
df.columns
Index(['country', 'month', 'individualism_index', 'confirmed', 'dead',
'population', 'health_expenditure', 'gdp_per_capita',
'population_ages_65_and_above_total', 'population_density',
'over65_per_capita', 'population_total', 'new_cases_per_100k',
'new_cases_total', 'new_deaths_total', 'new_cases_total_log',
'new_cases_per_100k_log', 'new_deaths_total_log', 'mobility_index',
'containment_health_index', 'democracy_index', 'capital', 'distance',
'continent', 'confirmed_log', 'cases_neighbors', 'distance_log',
'individualism_gov', 'individualism_mob', 'health_expenditure_log',
'gdp_per_capita_log'],
dtype='object')
# Per-country summary statistics of the containment/health index
index = df.groupby('country').agg({'containment_health_index': ['min', 'mean', 'max', 'std']})
# NOTE(review): the next statement was truncated in the export
# (`['min` with no closing quote); reconstructed as the mean of the
# per-country minima, which matches the scalar output shown (38.31).
index['containment_health_index']['min'].mean()
38.30793103448276
# Raw-scale and log-scale versions of the heatmap
plot_cases(df_at_home_agg)
plot_cases(df_at_home_agg_log, set_params_max=True, vmin= -8, vmax = 2.4)
def gov_intervention_plot(gov_df):
    """Plot when governments first imposed national stay-at-home rules,
    split into three rural-area categories.

    NOTE(review): expects a 'rural_category' column ('Low'/'Medium'/'High')
    and an `annotate_plot` helper, both defined elsewhere in the notebook —
    this function belongs to the earlier rural analysis and would KeyError
    on the idv-based gov_df built above.

    Args:
        gov_df (dataframe): Dataframe containing government response information

    Returns:
        None
    """
    # Rows where days_after == 0, i.e. the intervention day itself
    df_plot = gov_df.loc[gov_df['days_after'] == 0, ['date', 'rural_category']]
    # Month-abbreviation formatter for the shared x axis
    date_format = mdates.DateFormatter('%b')
    # Create the plot
    fig, axs = plt.subplots(1,3, figsize=(15,4), sharex = True)
    fig.suptitle('Figure 7. Covid-19 and the first time of national stay at home requirements by rural area type')
    # Category -> subplot position
    rural_dict = {'Low': 0,
                  'Medium': 1,
                  'High': 2}
    # Configure each graph in a loop
    for category, i in rural_dict.items():
        sns.histplot(df_plot[df_plot['rural_category'] == category], x = 'date', ax = axs[i], bins = 10)
        axs[i].set_title(f"Rural category: {category}")
        axs[i].xaxis.set_major_formatter(date_format)
        axs[i].set_ylabel("Number of countries")
        # Only the leftmost subplot keeps its y label
        if i != 0: axs[i].set_ylabel("")
    # Add annotation
    annotation_text = """Note: The y-axes are different for each plot to showcase the relative distributions within each group"""
    annotate_plot(annotation_text, top = 0.8, bottom = 0.2, x1=.06, x2=.05)
# Reload the cleaned panel for the regression section
df = pd.read_csv('2021-01-04 final_data.csv')
df.head()
| country | month | individualism_index | confirmed | dead | population | health_expenditure | gdp_per_capita | population_ages_65_and_above_total | population_density | ... | capital | distance | continent | confirmed_log | cases_neighbors | distance_log | individualism_gov | individualism_mob | health_expenditure_log | gdp_per_capita_log | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Argentina | 2.0 | 46.0 | 0.0 | 0.0 | 44938712.0 | 9.124315 | 9729.141574 | 5052508.0 | 16.25851 | ... | Buenos Aires | 19206.559549 | SA | -29.933606 | 0.000474 | 9.863007 | 353.740000 | -32.791687 | 2.210943 | 9.182881 |
| 1 | Argentina | 3.0 | 46.0 | 820.0 | 20.0 | 44938712.0 | 9.124315 | 9729.141574 | 5052508.0 | 16.25851 | ... | Buenos Aires | 19206.559549 | SA | 6.709304 | 5.122697 | 9.863007 | 2072.641290 | -1506.103166 | 2.210943 | 9.182881 |
| 2 | Argentina | 4.0 | 46.0 | 4201.0 | 207.0 | 44938712.0 | 9.124315 | 9729.141574 | 5052508.0 | 16.25851 | ... | Buenos Aires | 19206.559549 | SA | 8.343078 | 23.515800 | 9.863007 | 3757.740000 | -3130.225314 | 2.210943 | 9.182881 |
| 3 | Argentina | 5.0 | 46.0 | 14702.0 | 510.0 | 44938712.0 | 9.124315 | 9729.141574 | 5052508.0 | 16.25851 | ... | Buenos Aires | 19206.559549 | SA | 9.595739 | 138.163769 | 9.863007 | 3611.756774 | -2817.033416 | 2.210943 | 9.182881 |
| 4 | Argentina | 6.0 | 46.0 | 59933.0 | 1245.0 | 44938712.0 | 9.124315 | 9729.141574 | 5052508.0 | 16.25851 | ... | Buenos Aires | 19206.559549 | SA | 11.000983 | 318.002633 | 9.863007 | 3575.672000 | -2429.974159 | 2.210943 | 9.182881 |
5 rows × 31 columns
def export_csv(df, export_no_two=True,
               export_dummies=True,
               export_dummies_no_two=False,
               mean_vars_export=False,
               vars_to_mean = ['individualism_index',
                               'health_expenditure',
                               'gdp_per_capita',
                               'over65_per_capita',
                               'population_density',
                               'mobility_index',
                               'containment_health_index',
                               'democracy_index',
                               'distance']):
    """Export modelling datasets to CSV in several variants.

    Args:
        df: cleaned monthly panel; must contain 'month' and 'continent'.
        export_no_two: write the panel without February (month 2).
        export_dummies: write the panel with continent dummy columns.
        export_dummies_no_two: dummy variant without February.
        mean_vars_export: dummy variant with vars_to_mean mean-centered.
        vars_to_mean: columns to mean-center for the centered export.
            (The default list is read-only here, so the shared mutable
            default is harmless, if inadvisable.)

    Returns:
        None; writes CSV files to the working directory.
    """
    if export_no_two:
        df[df.month != 2].to_csv('2021-01-04 clean_data_from_3.csv', index=False)
    # Build the dummy frame whenever any dummy-based export needs it.
    # BUG FIX: df_cont was previously only created inside the export_dummies
    # branch, so export_dummies_no_two / mean_vars_export alone raised
    # NameError when export_dummies was False.
    if export_dummies or export_dummies_no_two or mean_vars_export:
        # Keep at most the first five continent dummies
        df_cont = pd.concat((df.drop('continent', axis=1),
                             pd.get_dummies(df['continent'], prefix = 'continent').iloc[:,0:5]),
                            axis=1)
    if export_dummies:
        df_cont.to_csv("2021-01-03 clean_data_w_dummies.csv", index=False)
    if export_dummies_no_two:
        # NOTE(review): this writes to the SAME filename as export_dummies —
        # kept for backward compatibility, but a distinct name was likely intended.
        df_cont[df_cont.month != 2].to_csv("2021-01-03 clean_data_w_dummies.csv", index=False)
    if mean_vars_export:
        # Mean-center the selected covariates, keep the rest unchanged
        vars_no_mean = [x for x in df_cont.columns if x not in vars_to_mean]
        df_cont_mean_centered = pd.concat((df_cont[vars_no_mean],
                                           df_cont[vars_to_mean] - df_cont[vars_to_mean].mean()),
                                          axis = 1)
        df_cont_mean_centered[df_cont_mean_centered.month != 2].to_csv('2021-01-03 clean_data_w_dummies_centered.csv')
# Month-12 cross-section, indexed by country; the groupby means align on
# the country index when assigned
df_newest = df[df.month==12]
df_newest = df_newest.set_index('country')
df_newest['mobility_index_avg'] = df.groupby('country').mobility_index.mean()
df_newest['containment_health_index_avg'] = df.groupby('country').containment_health_index.mean()
# Adding new changes to the model
def get_interactions(df, month_12=False):
    """Add individualism x government and individualism x mobility
    interaction columns to *df* (modified in place and returned).

    With month_12=True the per-country averaged indices are used (the
    month-12 cross-sectional dataset); otherwise the monthly indices.
    """
    suffix = '_avg' if month_12 else ''
    idv = df['individualism_index']
    df['individualism_gov'] = idv * df['containment_health_index' + suffix]
    df['individualism_mob'] = idv * df['mobility_index' + suffix]
    return df
# Build the interaction terms on the December snapshot from the
# period-average indices (month_12=True selects the *_avg columns).
df_newest = get_interactions(df_newest, month_12=True)
# Inspect the resulting column set.
df_newest.columns
Index(['month', 'individualism_index', 'confirmed', 'dead', 'population',
'health_expenditure', 'gdp_per_capita',
'population_ages_65_and_above_total', 'population_density',
'over65_per_capita', 'population_total', 'new_cases_per_100k',
'new_cases_total', 'new_deaths_total', 'new_cases_total_log',
'new_cases_per_100k_log', 'new_deaths_total_log', 'mobility_index',
'containment_health_index', 'democracy_index', 'capital', 'distance',
'continent', 'confirmed_log', 'cases_neighbors', 'distance_log',
'individualism_gov', 'individualism_mob', 'health_expenditure_log',
'gdp_per_capita_log', 'season', 'mobility_index_avg',
'containment_health_index_avg'],
dtype='object')
# Correlation heatmap of the regressors considered for the cumulative model.
vars_to_use = [
    'individualism_index', 'gdp_per_capita', 'population_density',
    'mobility_index', 'containment_health_index', 'democracy_index',
    'distance_log', 'continent', 'individualism_gov', 'cases_neighbors',
]
fig, ax = plt.subplots(figsize=(12, 7))
sns.heatmap(df_newest[vars_to_use].corr(), annot=True)
<AxesSubplot:>
# Regressors for the cumulative (log confirmed cases) model on the
# December snapshot, using period-average mobility/containment.
vars_to_use = [
    'individualism_index', 'gdp_per_capita', 'population_density',
    'mobility_index_avg', 'containment_health_index_avg', 'democracy_index',
    'distance_log', 'continent', 'individualism_gov', 'cases_neighbors',
]
# BUG FIX: the Tanzania exclusion previously compared the whole frame to a
# string (df_newest != 'Tanzania'), which masks individual cell values and
# drops no rows; filter on the country index instead, as done later in the
# file.
results_total2 = smf.ols('confirmed_log ~ ' + ' + '.join(vars_to_use),
                         data=df_newest[df_newest.index != 'Tanzania']).fit()
results_total = smf.ols('confirmed_log ~ ' + ' + '.join(vars_to_use),
                        data=df_newest).fit()
results_total.summary()
| Dep. Variable: | confirmed_log | R-squared: | 0.519 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.401 |
| Method: | Least Squares | F-statistic: | 4.390 |
| Date: | Thu, 07 Jan 2021 | Prob (F-statistic): | 3.11e-05 |
| Time: | 08:16:26 | Log-Likelihood: | -126.84 |
| No. Observations: | 72 | AIC: | 283.7 |
| Df Residuals: | 57 | BIC: | 317.8 |
| Df Model: | 14 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | -0.6880 | 7.282 | -0.094 | 0.925 | -15.269 | 13.893 |
| continent[T.AS] | 2.2030 | 1.114 | 1.977 | 0.053 | -0.028 | 4.434 |
| continent[T.EU] | 1.5799 | 1.001 | 1.578 | 0.120 | -0.424 | 3.584 |
| continent[T.N.A.] | 0.6526 | 0.992 | 0.658 | 0.513 | -1.334 | 2.639 |
| continent[T.OC] | -2.3992 | 1.457 | -1.647 | 0.105 | -5.317 | 0.519 |
| continent[T.SA] | 1.9441 | 1.122 | 1.732 | 0.089 | -0.303 | 4.191 |
| individualism_index | -0.0312 | 0.067 | -0.464 | 0.645 | -0.166 | 0.104 |
| gdp_per_capita | -7.994e-06 | 1.3e-05 | -0.615 | 0.541 | -3.4e-05 | 1.8e-05 |
| population_density | -0.0001 | 0.000 | -0.657 | 0.514 | -0.001 | 0.000 |
| mobility_index_avg | 0.0098 | 0.024 | 0.406 | 0.686 | -0.039 | 0.058 |
| containment_health_index_avg | 0.0597 | 0.055 | 1.084 | 0.283 | -0.051 | 0.170 |
| democracy_index | -0.0118 | 0.017 | -0.713 | 0.479 | -0.045 | 0.021 |
| distance_log | 0.8928 | 0.719 | 1.242 | 0.219 | -0.547 | 2.333 |
| individualism_gov | 0.0013 | 0.001 | 1.039 | 0.303 | -0.001 | 0.004 |
| cases_neighbors | 0.0002 | 0.001 | 0.420 | 0.676 | -0.001 | 0.001 |
| Omnibus: | 7.626 | Durbin-Watson: | 1.731 |
|---|---|---|---|
| Prob(Omnibus): | 0.022 | Jarque-Bera (JB): | 7.171 |
| Skew: | -0.620 | Prob(JB): | 0.0277 |
| Kurtosis: | 3.922 | Cond. No. | 1.34e+06 |
# Collect the p-values of the most recently fitted model as one row.
# NOTE(review): `results` here is whichever model was fitted last in the
# session — confirm this cell is run after the intended fit.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement.
results_df = pd.DataFrame()
results_df = pd.concat([results_df, results.pvalues.to_frame().T], ignore_index=True)
params_ = results.params
p_ = results.pvalues
# December-snapshot model with Tanzania excluded, using period-average
# mobility/containment indices.
vars_to_use = [
    'individualism_index', 'gdp_per_capita', 'population_density',
    'mobility_index_avg', 'containment_health_index_avg', 'democracy_index',
    'distance_log', 'continent', 'individualism_gov', 'cases_neighbors',
]
df_newest2 = df_newest[df_newest.index != 'Tanzania']
# Reuse the filtered frame instead of repeating the filter expression
# (previously df_newest2 was built but the fit recomputed the same mask).
results_total2 = smf.ols('confirmed_log ~ ' + ' + '.join(vars_to_use),
                         data=df_newest2).fit()
results_total2.summary()
| Dep. Variable: | confirmed_log | R-squared: | 0.470 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.338 |
| Method: | Least Squares | F-statistic: | 3.553 |
| Date: | Thu, 07 Jan 2021 | Prob (F-statistic): | 0.000348 |
| Time: | 14:42:34 | Log-Likelihood: | -124.36 |
| No. Observations: | 71 | AIC: | 278.7 |
| Df Residuals: | 56 | BIC: | 312.7 |
| Df Model: | 14 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | 1.8644 | 7.447 | 0.250 | 0.803 | -13.053 | 16.782 |
| continent[T.AS] | 2.0664 | 1.109 | 1.863 | 0.068 | -0.155 | 4.288 |
| continent[T.EU] | 1.4363 | 0.998 | 1.439 | 0.156 | -0.563 | 3.435 |
| continent[T.N.A.] | 0.5697 | 0.985 | 0.578 | 0.565 | -1.404 | 2.544 |
| continent[T.OC] | -2.6646 | 1.457 | -1.829 | 0.073 | -5.584 | 0.255 |
| continent[T.SA] | 1.8619 | 1.114 | 1.671 | 0.100 | -0.370 | 4.094 |
| individualism_index | -0.0606 | 0.070 | -0.866 | 0.390 | -0.201 | 0.080 |
| gdp_per_capita | -8.437e-06 | 1.29e-05 | -0.654 | 0.516 | -3.43e-05 | 1.74e-05 |
| population_density | -0.0001 | 0.000 | -0.665 | 0.509 | -0.001 | 0.000 |
| mobility_index_avg | 0.0047 | 0.024 | 0.196 | 0.846 | -0.044 | 0.053 |
| containment_health_index_avg | 0.0180 | 0.062 | 0.290 | 0.773 | -0.107 | 0.143 |
| democracy_index | -0.0109 | 0.016 | -0.662 | 0.511 | -0.044 | 0.022 |
| distance_log | 0.8620 | 0.713 | 1.208 | 0.232 | -0.567 | 2.291 |
| individualism_gov | 0.0018 | 0.001 | 1.437 | 0.156 | -0.001 | 0.004 |
| cases_neighbors | 0.0002 | 0.001 | 0.264 | 0.793 | -0.001 | 0.001 |
| Omnibus: | 9.036 | Durbin-Watson: | 1.726 |
|---|---|---|---|
| Prob(Omnibus): | 0.011 | Jarque-Bera (JB): | 8.788 |
| Skew: | -0.715 | Prob(JB): | 0.0124 |
| Kurtosis: | 3.962 | Cond. No. | 1.38e+06 |
# Export the cumulative model as a LaTeX table.
# summary_col is imported here so this cell works when the file is executed
# top-to-bottom (the original import statement only appears much later).
from statsmodels.iolib.summary2 import summary_col

result_summ = summary_col([results_total2], stars=True)
res_df = result_summ.tables[0]
res_df.columns = ['Cumulative']
with open('cumulative_regression.tex', 'w') as tf:
    tf.write(res_df.to_latex())
# Inspect the columns available on the full monthly panel.
df.columns
Index(['country', 'month', 'individualism_index', 'confirmed', 'dead',
'population', 'health_expenditure', 'gdp_per_capita',
'population_ages_65_and_above_total', 'population_density',
'over65_per_capita', 'population_total', 'new_cases_per_100k',
'new_cases_total', 'new_deaths_total', 'new_cases_total_log',
'new_cases_per_100k_log', 'new_deaths_total_log', 'mobility_index',
'containment_health_index', 'democracy_index', 'capital', 'distance',
'continent', 'confirmed_log', 'cases_neighbors', 'distance_log',
'individualism_gov', 'individualism_mob', 'health_expenditure_log',
'gdp_per_capita_log'],
dtype='object')
# Sanity check: maximum observed containment/health index in the panel.
df['containment_health_index'].max()
87.693
# Preview the patsy formula that the monthly regressions will use.
'new_cases_per_100k_log ~ ' + ' + '.join(vars_to_use)
'new_cases_per_100k_log ~ individualism_index + gdp_per_capita + population_density + mobility_index + containment_health_index + democracy_index + distance_log + continent + individualism_gov + cases_neighbors'
# Fit one cross-sectional OLS model per month (Feb-Dec) of log new cases
# per 100k on the country-level regressors; keep each fitted model keyed by
# month in monthly_vars.
result_dict = {}
# NOTE(review): df3 (Tanzania excluded) is built here but the loop below
# fits on the full df — confirm whether the exclusion was intended.
df3 = df[df.country != 'Tanzania'].copy()
# Create many variables in a for-loop
monthly_vars = {}
vars_to_use = ['individualism_index', 'gdp_per_capita',
               'population_density',
               'mobility_index', 'containment_health_index', 'democracy_index', 'distance_log', 'continent', 'individualism_gov',
               'cases_neighbors']
for i in range(2, 13):
    results = smf.ols('new_cases_per_100k_log ~ ' + ' + '.join(vars_to_use), data=df[df.month == i]).fit()
    monthly_vars['month_{}'.format(i)] = results
# Re-wrap every monthly fit with robust covariance before tabulating, then
# split the eleven months across two LaTeX tables (months 2-7 and 8-12).
# summary_col is imported here so this cell works when the file is executed
# top-to-bottom (the original import statement only appears much later).
from statsmodels.iolib.summary2 import summary_col

for k, v in monthly_vars.items():
    monthly_vars[k] = v.get_robustcov_results()
rs_list = list(monthly_vars.values())
result_summ = summary_col(rs_list[0:6], stars=True)
res_df1 = result_summ.tables[0]
res_df1.columns = [f'Month_{x}' for x in range(2, 8)]
with open('summary_monthly1.tex', 'w') as tf:
    tf.write(res_df1.to_latex())
result_summ = summary_col(rs_list[6:], stars=True)
res_df2 = result_summ.tables[0]
res_df2.columns = [f'Month_{x}' for x in range(8, 13)]
with open('summary_monthly2.tex', 'w') as tf:
    tf.write(res_df2.to_latex())
# NOTE(review): results_months1 / results_months2 are not defined anywhere
# in this file, so this cell raises NameError if run as-is. It also writes
# res_df (the cumulative table built earlier) rather than the res_df1 /
# res_df2 it just assigned — confirm which table 'summary_seasonal.tex' was
# meant to contain before relying on this output.
res_df1 = results_months1.tables[0]
res_df1.columns = [f'Month_{x}' for x in range(2, 8)]
res_df2 = results_months2.tables[0]
res_df2.columns = [f'Month_{x}' for x in range(8, 13)]
with open('summary_seasonal.tex', 'w') as tf:
    tf.write(res_df.to_latex())
# Per-month OLS fits with full printed summaries; the p-values and
# coefficients of each fit are stashed in result_dict for later tabulation.
# NOTE(review): df3 excludes Tanzania but the regressions below use the full
# df — confirm whether the exclusion was intended here.
df3 = df[df.country != 'Tanzania'].copy()
vars_to_use = ['individualism_index', 'gdp_per_capita',
               'population_density',
               'mobility_index', 'containment_health_index', 'democracy_index', 'distance_log', 'continent', 'individualism_gov',
               'cases_neighbors']
for month in range(2, 13):
    results = smf.ols('new_cases_per_100k_log ~ ' + ' + '.join(vars_to_use), data=df[df.month == month]).fit()
    print(f"\n\n------------------- MONTH: {month} -----------------------")
    print(results.summary())
    result_dict[str(month) + '_pvalues'] = results.pvalues
    result_dict[str(month) + '_params'] = results.params
------------------- MONTH: 2 -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.442
Model: OLS Adj. R-squared: 0.305
Method: Least Squares F-statistic: 3.222
Date: Thu, 07 Jan 2021 Prob (F-statistic): 0.000883
Time: 15:04:27 Log-Likelihood: -202.82
No. Observations: 72 AIC: 435.6
Df Residuals: 57 BIC: 469.8
Df Model: 14
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept 37.8081 23.409 1.615 0.112 -9.067 84.683
continent[T.AS] -2.7975 3.214 -0.870 0.388 -9.233 3.638
continent[T.EU] -0.9357 2.371 -0.395 0.695 -5.684 3.813
continent[T.N.A.] 0.5199 2.528 0.206 0.838 -4.543 5.583
continent[T.OC] -0.5764 4.376 -0.132 0.896 -9.338 8.186
continent[T.SA] 1.2658 3.044 0.416 0.679 -4.831 7.362
individualism_index 0.0022 0.052 0.042 0.967 -0.102 0.107
gdp_per_capita 6.134e-05 3.89e-05 1.577 0.120 -1.65e-05 0.000
population_density 0.0004 0.001 0.609 0.545 -0.001 0.002
mobility_index 0.0700 0.150 0.466 0.643 -0.231 0.371
containment_health_index -0.0772 0.160 -0.482 0.632 -0.398 0.244
democracy_index 0.0116 0.050 0.232 0.818 -0.089 0.112
distance_log -5.4697 2.426 -2.254 0.028 -10.328 -0.611
individualism_gov 0.0043 0.003 1.335 0.187 -0.002 0.011
cases_neighbors 0.4718 2.088 0.226 0.822 -3.709 4.653
==============================================================================
Omnibus: 2.587 Durbin-Watson: 2.640
Prob(Omnibus): 0.274 Jarque-Bera (JB): 1.577
Skew: -0.042 Prob(JB): 0.454
Kurtosis: 2.280 Cond. No. 1.50e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.5e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
------------------- MONTH: 3 -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.788
Model: OLS Adj. R-squared: 0.737
Method: Least Squares F-statistic: 15.18
Date: Thu, 07 Jan 2021 Prob (F-statistic): 2.22e-14
Time: 15:04:27 Log-Likelihood: -101.45
No. Observations: 72 AIC: 232.9
Df Residuals: 57 BIC: 267.0
Df Model: 14
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept -12.5020 5.346 -2.339 0.023 -23.207 -1.797
continent[T.AS] 1.8000 0.800 2.249 0.028 0.197 3.403
continent[T.EU] 2.2048 0.554 3.983 0.000 1.096 3.313
continent[T.N.A.] 0.7048 0.613 1.149 0.255 -0.524 1.933
continent[T.OC] 1.1584 1.002 1.156 0.253 -0.848 3.165
continent[T.SA] 0.3635 0.743 0.489 0.627 -1.125 1.852
individualism_index 0.0403 0.030 1.360 0.179 -0.019 0.100
gdp_per_capita 3.353e-05 9.21e-06 3.642 0.001 1.51e-05 5.2e-05
population_density 6.206e-05 0.000 0.397 0.693 -0.000 0.000
mobility_index -0.0509 0.022 -2.326 0.024 -0.095 -0.007
containment_health_index 0.0256 0.032 0.788 0.434 -0.039 0.090
democracy_index 0.0153 0.012 1.283 0.205 -0.009 0.039
distance_log 0.9424 0.545 1.731 0.089 -0.148 2.033
individualism_gov -0.0009 0.001 -1.198 0.236 -0.002 0.001
cases_neighbors 0.0040 0.004 1.017 0.313 -0.004 0.012
==============================================================================
Omnibus: 4.141 Durbin-Watson: 1.971
Prob(Omnibus): 0.126 Jarque-Bera (JB): 3.360
Skew: -0.395 Prob(JB): 0.186
Kurtosis: 3.705 Cond. No. 1.40e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.4e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
------------------- MONTH: 4 -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.735
Model: OLS Adj. R-squared: 0.669
Method: Least Squares F-statistic: 11.26
Date: Thu, 07 Jan 2021 Prob (F-statistic): 9.64e-12
Time: 15:04:27 Log-Likelihood: -97.324
No. Observations: 72 AIC: 224.6
Df Residuals: 57 BIC: 258.8
Df Model: 14
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept -10.1185 5.156 -1.963 0.055 -20.443 0.206
continent[T.AS] 2.0577 0.736 2.794 0.007 0.583 3.532
continent[T.EU] 2.6559 0.511 5.196 0.000 1.632 3.679
continent[T.N.A.] 1.2652 0.576 2.197 0.032 0.112 2.419
continent[T.OC] -0.2664 1.020 -0.261 0.795 -2.309 1.777
continent[T.SA] 1.4494 0.686 2.113 0.039 0.076 2.823
individualism_index 0.0192 0.048 0.402 0.689 -0.076 0.115
gdp_per_capita 2.599e-05 9.34e-06 2.782 0.007 7.28e-06 4.47e-05
population_density 0.0003 0.000 2.134 0.037 1.96e-05 0.001
mobility_index -0.0270 0.012 -2.244 0.029 -0.051 -0.003
containment_health_index -0.0269 0.028 -0.963 0.340 -0.083 0.029
democracy_index -0.0104 0.011 -0.955 0.344 -0.032 0.011
distance_log 1.2390 0.508 2.437 0.018 0.221 2.257
individualism_gov -4.832e-05 0.001 -0.069 0.945 -0.001 0.001
cases_neighbors 0.0007 0.003 0.286 0.776 -0.004 0.006
==============================================================================
Omnibus: 2.141 Durbin-Watson: 1.472
Prob(Omnibus): 0.343 Jarque-Bera (JB): 2.068
Skew: -0.399 Prob(JB): 0.356
Kurtosis: 2.772 Cond. No. 1.43e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.43e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
------------------- MONTH: 5 -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.272
Model: OLS Adj. R-squared: 0.093
Method: Least Squares F-statistic: 1.518
Date: Thu, 07 Jan 2021 Prob (F-statistic): 0.134
Time: 15:04:27 Log-Likelihood: -161.90
No. Observations: 72 AIC: 353.8
Df Residuals: 57 BIC: 388.0
Df Model: 14
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept -3.6385 12.290 -0.296 0.768 -28.250 20.973
continent[T.AS] 0.7656 1.818 0.421 0.675 -2.874 4.406
continent[T.EU] 1.8166 1.258 1.444 0.154 -0.702 4.336
continent[T.N.A.] 0.3459 1.479 0.234 0.816 -2.615 3.307
continent[T.OC] -2.3787 2.417 -0.984 0.329 -7.218 2.460
continent[T.SA] 2.5601 1.901 1.347 0.183 -1.247 6.367
individualism_index 0.0229 0.119 0.193 0.847 -0.214 0.260
gdp_per_capita 1.192e-05 2.22e-05 0.538 0.593 -3.25e-05 5.63e-05
population_density 0.0004 0.000 1.039 0.303 -0.000 0.001
mobility_index -0.0229 0.026 -0.881 0.382 -0.075 0.029
containment_health_index 0.0011 0.073 0.015 0.988 -0.145 0.147
democracy_index -0.0374 0.028 -1.348 0.183 -0.093 0.018
distance_log 0.5286 1.214 0.435 0.665 -1.902 2.959
individualism_gov 0.0002 0.002 0.141 0.888 -0.003 0.004
cases_neighbors 0.0028 0.006 0.442 0.660 -0.010 0.015
==============================================================================
Omnibus: 98.090 Durbin-Watson: 1.730
Prob(Omnibus): 0.000 Jarque-Bera (JB): 2042.647
Skew: -4.075 Prob(JB): 0.00
Kurtosis: 27.788 Cond. No. 1.39e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.39e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
------------------- MONTH: 6 -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.447
Model: OLS Adj. R-squared: 0.311
Method: Least Squares F-statistic: 3.290
Date: Thu, 07 Jan 2021 Prob (F-statistic): 0.000723
Time: 15:04:27 Log-Likelihood: -154.18
No. Observations: 72 AIC: 338.4
Df Residuals: 57 BIC: 372.5
Df Model: 14
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept -22.8302 10.834 -2.107 0.040 -44.525 -1.135
continent[T.AS] 3.1305 1.642 1.906 0.062 -0.159 6.419
continent[T.EU] 3.5579 1.143 3.113 0.003 1.269 5.847
continent[T.N.A.] 1.7705 1.364 1.298 0.199 -0.961 4.502
continent[T.OC] 1.2898 2.408 0.536 0.594 -3.533 6.113
continent[T.SA] 2.2152 2.021 1.096 0.278 -1.831 6.262
individualism_index 0.0396 0.087 0.454 0.652 -0.135 0.214
gdp_per_capita 1.392e-05 1.95e-05 0.715 0.477 -2.5e-05 5.29e-05
population_density 0.0002 0.000 0.710 0.481 -0.000 0.001
mobility_index 0.0026 0.023 0.115 0.908 -0.043 0.048
containment_health_index 0.1374 0.058 2.358 0.022 0.021 0.254
democracy_index -0.0193 0.025 -0.776 0.441 -0.069 0.030
distance_log 1.7589 1.080 1.629 0.109 -0.404 3.922
individualism_gov -0.0007 0.001 -0.501 0.619 -0.003 0.002
cases_neighbors 0.0004 0.005 0.090 0.928 -0.009 0.010
==============================================================================
Omnibus: 45.252 Durbin-Watson: 1.657
Prob(Omnibus): 0.000 Jarque-Bera (JB): 232.495
Skew: -1.711 Prob(JB): 3.27e-51
Kurtosis: 11.111 Cond. No. 1.37e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.37e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
------------------- MONTH: 7 -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.488
Model: OLS Adj. R-squared: 0.363
Method: Least Squares F-statistic: 3.885
Date: Thu, 07 Jan 2021 Prob (F-statistic): 0.000128
Time: 15:04:27 Log-Likelihood: -151.81
No. Observations: 72 AIC: 333.6
Df Residuals: 57 BIC: 367.8
Df Model: 14
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept -19.9097 9.686 -2.055 0.044 -39.307 -0.513
continent[T.AS] 2.5675 1.525 1.683 0.098 -0.487 5.622
continent[T.EU] 3.8146 1.151 3.315 0.002 1.510 6.119
continent[T.N.A.] 1.1589 1.426 0.813 0.420 -1.696 4.014
continent[T.OC] 1.0554 2.078 0.508 0.613 -3.105 5.216
continent[T.SA] 0.8507 1.877 0.453 0.652 -2.909 4.610
individualism_index 0.0127 0.066 0.193 0.847 -0.119 0.145
gdp_per_capita 2.398e-05 1.87e-05 1.281 0.205 -1.35e-05 6.15e-05
population_density 0.0002 0.000 0.551 0.584 -0.000 0.001
mobility_index -0.0238 0.020 -1.186 0.241 -0.064 0.016
containment_health_index 0.1211 0.050 2.406 0.019 0.020 0.222
democracy_index -0.0109 0.023 -0.468 0.641 -0.058 0.036
distance_log 1.5452 1.005 1.538 0.130 -0.466 3.557
individualism_gov -0.0002 0.001 -0.223 0.824 -0.002 0.002
cases_neighbors 0.0013 0.003 0.409 0.684 -0.005 0.008
==============================================================================
Omnibus: 54.858 Durbin-Watson: 1.952
Prob(Omnibus): 0.000 Jarque-Bera (JB): 376.731
Skew: -2.058 Prob(JB): 1.56e-82
Kurtosis: 13.423 Cond. No. 1.26e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.26e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
------------------- MONTH: 8 -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.502
Model: OLS Adj. R-squared: 0.380
Method: Least Squares F-statistic: 4.112
Date: Thu, 07 Jan 2021 Prob (F-statistic): 6.75e-05
Time: 15:04:27 Log-Likelihood: -148.60
No. Observations: 72 AIC: 327.2
Df Residuals: 57 BIC: 361.3
Df Model: 14
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept -28.0291 9.325 -3.006 0.004 -46.702 -9.356
continent[T.AS] 4.0171 1.415 2.839 0.006 1.184 6.851
continent[T.EU] 4.3473 1.106 3.930 0.000 2.132 6.563
continent[T.N.A.] 1.9246 1.285 1.498 0.140 -0.648 4.498
continent[T.OC] 0.7637 1.949 0.392 0.697 -3.139 4.666
continent[T.SA] 1.3112 1.795 0.731 0.468 -2.283 4.905
individualism_index 0.0614 0.065 0.938 0.352 -0.070 0.192
gdp_per_capita -1.748e-05 1.75e-05 -0.999 0.322 -5.25e-05 1.76e-05
population_density 0.0004 0.000 1.252 0.216 -0.000 0.001
mobility_index -0.0077 0.017 -0.462 0.646 -0.041 0.026
containment_health_index 0.1360 0.049 2.793 0.007 0.038 0.234
democracy_index 0.0149 0.022 0.676 0.502 -0.029 0.059
distance_log 2.1582 0.950 2.271 0.027 0.255 4.061
individualism_gov -0.0008 0.001 -0.766 0.447 -0.003 0.001
cases_neighbors 0.0009 0.003 0.320 0.750 -0.005 0.007
==============================================================================
Omnibus: 56.594 Durbin-Watson: 1.892
Prob(Omnibus): 0.000 Jarque-Bera (JB): 411.709
Skew: -2.120 Prob(JB): 3.97e-90
Kurtosis: 13.920 Cond. No. 1.27e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.27e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
------------------- MONTH: 9 -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.599
Model: OLS Adj. R-squared: 0.501
Method: Least Squares F-statistic: 6.082
Date: Thu, 07 Jan 2021 Prob (F-statistic): 3.85e-07
Time: 15:04:27 Log-Likelihood: -142.87
No. Observations: 72 AIC: 315.7
Df Residuals: 57 BIC: 349.9
Df Model: 14
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept -31.0789 8.626 -3.603 0.001 -48.352 -13.806
continent[T.AS] 4.3768 1.343 3.259 0.002 1.688 7.066
continent[T.EU] 4.9054 0.994 4.935 0.000 2.915 6.896
continent[T.N.A.] 1.7243 1.149 1.501 0.139 -0.576 4.024
continent[T.OC] -0.2881 1.781 -0.162 0.872 -3.855 3.279
continent[T.SA] 1.3408 1.393 0.962 0.340 -1.449 4.131
individualism_index 0.0727 0.057 1.274 0.208 -0.042 0.187
gdp_per_capita -7.089e-06 1.62e-05 -0.438 0.663 -3.95e-05 2.54e-05
population_density 7.236e-05 0.000 0.261 0.795 -0.000 0.001
mobility_index -0.0125 0.023 -0.549 0.585 -0.058 0.033
containment_health_index 0.1366 0.046 2.990 0.004 0.045 0.228
democracy_index 0.0083 0.021 0.406 0.687 -0.033 0.050
distance_log 2.5093 0.893 2.811 0.007 0.722 4.297
individualism_gov -0.0010 0.001 -1.028 0.309 -0.003 0.001
cases_neighbors 0.0015 0.002 0.829 0.411 -0.002 0.005
==============================================================================
Omnibus: 63.478 Durbin-Watson: 1.803
Prob(Omnibus): 0.000 Jarque-Bera (JB): 540.896
Skew: -2.413 Prob(JB): 3.52e-118
Kurtosis: 15.531 Cond. No. 1.27e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.27e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
------------------- MONTH: 10 -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.680
Model: OLS Adj. R-squared: 0.601
Method: Least Squares F-statistic: 8.635
Date: Thu, 07 Jan 2021 Prob (F-statistic): 1.32e-09
Time: 15:04:27 Log-Likelihood: -140.89
No. Observations: 72 AIC: 311.8
Df Residuals: 57 BIC: 345.9
Df Model: 14
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept -31.9953 8.435 -3.793 0.000 -48.886 -15.105
continent[T.AS] 4.2210 1.357 3.110 0.003 1.503 6.939
continent[T.EU] 5.5574 1.018 5.461 0.000 3.520 7.595
continent[T.N.A.] 2.6310 1.069 2.461 0.017 0.491 4.771
continent[T.OC] -0.4809 1.719 -0.280 0.781 -3.922 2.961
continent[T.SA] 1.7679 1.292 1.369 0.176 -0.818 4.354
individualism_index 0.1283 0.055 2.352 0.022 0.019 0.238
gdp_per_capita -9.235e-06 1.63e-05 -0.565 0.574 -4.2e-05 2.35e-05
population_density -0.0001 0.000 -0.424 0.673 -0.001 0.000
mobility_index 0.0019 0.023 0.082 0.935 -0.044 0.047
containment_health_index 0.1805 0.046 3.949 0.000 0.089 0.272
democracy_index 0.0040 0.020 0.197 0.845 -0.036 0.044
distance_log 2.4402 0.888 2.748 0.008 0.662 4.218
individualism_gov -0.0020 0.001 -2.147 0.036 -0.004 -0.000
cases_neighbors 0.0004 0.001 0.414 0.681 -0.002 0.002
==============================================================================
Omnibus: 56.865 Durbin-Watson: 1.731
Prob(Omnibus): 0.000 Jarque-Bera (JB): 438.806
Skew: -2.103 Prob(JB): 5.18e-96
Kurtosis: 14.339 Cond. No. 1.28e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.28e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
------------------- MONTH: 11 -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.719
Model: OLS Adj. R-squared: 0.650
Method: Least Squares F-statistic: 10.42
Date: Thu, 07 Jan 2021 Prob (F-statistic): 4.25e-11
Time: 15:04:27 Log-Likelihood: -138.88
No. Observations: 72 AIC: 307.8
Df Residuals: 57 BIC: 341.9
Df Model: 14
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept -31.1961 8.159 -3.823 0.000 -47.534 -14.858
continent[T.AS] 3.7483 1.344 2.790 0.007 1.058 6.439
continent[T.EU] 4.5439 1.220 3.725 0.000 2.102 6.986
continent[T.N.A.] 3.0366 1.069 2.842 0.006 0.897 5.177
continent[T.OC] -2.4472 1.901 -1.287 0.203 -6.254 1.359
continent[T.SA] 2.0639 1.250 1.652 0.104 -0.439 4.566
individualism_index 0.2274 0.066 3.422 0.001 0.094 0.361
gdp_per_capita -5.254e-06 1.55e-05 -0.340 0.735 -3.62e-05 2.57e-05
population_density -0.0002 0.000 -0.848 0.400 -0.001 0.000
mobility_index 0.0359 0.027 1.322 0.191 -0.018 0.090
containment_health_index 0.2481 0.049 5.056 0.000 0.150 0.346
democracy_index -0.0060 0.020 -0.298 0.767 -0.046 0.034
distance_log 2.0746 0.856 2.423 0.019 0.360 3.789
individualism_gov -0.0037 0.001 -3.368 0.001 -0.006 -0.001
cases_neighbors 0.0011 0.001 1.589 0.118 -0.000 0.002
==============================================================================
Omnibus: 55.799 Durbin-Watson: 1.935
Prob(Omnibus): 0.000 Jarque-Bera (JB): 414.083
Skew: -2.067 Prob(JB): 1.21e-90
Kurtosis: 13.997 Cond. No. 1.28e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.28e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
------------------- MONTH: 12 -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.624
Model: OLS Adj. R-squared: 0.532
Method: Least Squares F-statistic: 6.769
Date: Thu, 07 Jan 2021 Prob (F-statistic): 7.54e-08
Time: 15:04:27 Log-Likelihood: -145.85
No. Observations: 72 AIC: 321.7
Df Residuals: 57 BIC: 355.9
Df Model: 14
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept -30.5457 9.084 -3.363 0.001 -48.736 -12.356
continent[T.AS] 3.3357 1.480 2.254 0.028 0.372 6.299
continent[T.EU] 3.3807 1.360 2.485 0.016 0.656 6.105
continent[T.N.A.] 1.8079 1.243 1.455 0.151 -0.681 4.297
continent[T.OC] -2.8923 2.036 -1.420 0.161 -6.970 1.185
continent[T.SA] 1.2758 1.416 0.901 0.371 -1.559 4.111
individualism_index 0.1922 0.074 2.596 0.012 0.044 0.340
gdp_per_capita -2.454e-06 1.71e-05 -0.144 0.886 -3.66e-05 3.17e-05
population_density -0.0002 0.000 -0.751 0.455 -0.001 0.000
mobility_index 0.0171 0.030 0.563 0.575 -0.044 0.078
containment_health_index 0.2049 0.054 3.814 0.000 0.097 0.312
democracy_index 3.136e-05 0.022 0.001 0.999 -0.044 0.044
distance_log 2.2504 0.915 2.459 0.017 0.418 4.083
individualism_gov -0.0030 0.001 -2.487 0.016 -0.005 -0.001
cases_neighbors 0.0011 0.001 1.394 0.169 -0.000 0.003
==============================================================================
Omnibus: 64.311 Durbin-Watson: 2.066
Prob(Omnibus): 0.000 Jarque-Bera (JB): 581.013
Skew: -2.426 Prob(JB): 6.83e-127
Kurtosis: 16.043 Cond. No. 1.29e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.29e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
# Inspect which per-month p-value/coefficient series were collected.
result_dict.keys()
dict_keys(['2_pvalues', '2_params', '3_pvalues', '3_params', '4_pvalues', '4_params', '5_pvalues', '5_params', '6_pvalues', '6_params', '7_pvalues', '7_params', '8_pvalues', '8_params', '9_pvalues', '9_params', '10_pvalues', '10_params', '11_pvalues', '11_params', '12_pvalues', '12_params'])
# Assemble the per-month p-value and coefficient tables (rows = months
# 2-12, columns = regressors). DataFrame.append was removed in pandas 2.0,
# so rows are collected into lists and built in one shot instead of being
# appended one at a time.
pvalue_rows = []
coeff_rows = []
for key in result_dict.keys():
    if '_pvalues' in key:
        pvalue_rows.append(result_dict[key])
    if '_params' in key:
        coeff_rows.append(result_dict[key])
pvalues_df = pd.DataFrame(pvalue_rows).reset_index(drop=True)
coeff_df = pd.DataFrame(coeff_rows).reset_index(drop=True)
pvalues_df.index = range(2, 13)
coeff_df.index = range(2, 13)
# Calendar months mapped onto epidemic "seasons": February is the first
# winter, December the second. Built directly with integer keys (the
# original built a str-keyed dict and converted it afterwards).
seasons_dict = {
    2: 'winter_1',
    3: 'spring', 4: 'spring', 5: 'spring',
    6: 'summer', 7: 'summer', 8: 'summer',
    9: 'autumn', 10: 'autumn', 11: 'autumn',
    12: 'winter_2',
}
# Label each monthly observation with its epidemic season.
df['season'] = df['month'].map(seasons_dict)
# Aggregation spec for the seasonal collapse: the country-level regressors
# are averaged within each season, while new cases per 100k are summed over
# the season's months.
vars_to_agg = ['individualism_index', 'health_expenditure', 'gdp_per_capita',
               'over65_per_capita', 'population_density', 'mobility_index',
               'containment_health_index', 'democracy_index', 'distance_log',
               'individualism_gov', 'cases_neighbors']
dict_for_agg = dict.fromkeys(vars_to_agg, 'mean')
dict_for_agg['new_cases_per_100k'] = 'sum'
# Collapse the monthly panel to one row per country x season x continent.
df_seasons = df.groupby(['country', 'season', 'continent']).agg(dict_for_agg).reset_index()
# Tiny epsilon keeps the log finite for seasons with zero recorded cases.
df_seasons['new_cases_per_100k_log'] = np.log(df_seasons['new_cases_per_100k'] + 0.0000000001)
df_seasons = df_seasons.set_index('country')
from statsmodels.iolib.summary2 import summary_col
df.season.unique()
array(['winter_1', 'spring', 'summer', 'autumn', 'winter_2'], dtype=object)
# Regressors for the per-season OLS models; 'continent' enters as a
# categorical fixed effect through the patsy formula.
vars_to_use = ['individualism_index', 'gdp_per_capita',
               'population_density',
               'mobility_index', 'containment_health_index', 'democracy_index', 'distance_log', 'continent', 'individualism_gov', 'cases_neighbors']
formula = 'new_cases_per_100k_log ~ ' + ' + '.join(vars_to_use)
# One fit per season, kept in separate variables for later cells.
results1 = smf.ols(formula, data=df_seasons[df_seasons.season == 'winter_1']).fit()
results2 = smf.ols(formula, data=df_seasons[df_seasons.season == 'spring']).fit()
results3 = smf.ols(formula, data=df_seasons[df_seasons.season == 'summer']).fit()
results4 = smf.ols(formula, data=df_seasons[df_seasons.season == 'autumn']).fit()
results5 = smf.ols(formula, data=df_seasons[df_seasons.season == 'winter_2']).fit()
def get_season_vars(with_tanzania=True):
    """Fit one OLS model of log new cases per 100k for each season.

    Parameters
    ----------
    with_tanzania : bool
        If False, the Tanzania row is excluded from every season's
        sample (df_seasons is indexed by country).

    Returns
    -------
    dict
        Maps season name ('winter_1', 'spring', 'summer', 'autumn',
        'winter_2') to the fitted statsmodels results object.

    Notes
    -----
    FIX: the original duplicated the whole fitting loop in both branches,
    differing only in the row filter, and wrapped the dict key in a
    no-op '{}'.format(). Reads module-level `df_seasons` and `smf`.
    """
    vars_to_use = ['individualism_index', 'gdp_per_capita',
                   'population_density',
                   'mobility_index', 'containment_health_index', 'democracy_index', 'distance_log', 'continent', 'individualism_gov',
                   'cases_neighbors']
    formula = 'new_cases_per_100k_log ~ ' + ' + '.join(vars_to_use)
    season_vars = {}
    for season in ['winter_1', 'spring', 'summer', 'autumn', 'winter_2']:
        mask = df_seasons['season'] == season
        if not with_tanzania:
            # Index holds country names, so this drops the Tanzania row.
            mask &= df_seasons.index != 'Tanzania'
        season_vars[season] = smf.ols(formula, data=df_seasons[mask]).fit()
    return season_vars
# Refit all five seasonal models with Tanzania excluded; the bare name on
# the next line displays the season -> results mapping in the notebook.
season_vars = get_season_vars(with_tanzania=False)
season_vars
{'winter_1': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2198e28a7f0>,
'spring': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2198e299ca0>,
'summer': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2198dfedd00>,
'autumn': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2198e2b2cd0>,
'winter_2': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2198e2bf4f0>}
# Seasonal models (Tanzania excluded) plus the cumulative whole-period
# model, reordered so 'cumulative' comes first, then all switched to
# heteroskedasticity-robust covariance.
season_vars2 = get_season_vars(with_tanzania=False)
season_vars2['cumulative'] = results_total2
ordering = list(season_vars2)
ordering.insert(0, ordering.pop())  # last-inserted key ('cumulative') -> front
season_vars2 = {name: season_vars2[name] for name in ordering}
for name in season_vars2:
    season_vars2[name] = season_vars2[name].get_robustcov_results()
season_vars2
{'cumulative': <statsmodels.regression.linear_model.OLSResults at 0x2299ae475b0>,
'winter_1': <statsmodels.regression.linear_model.OLSResults at 0x2299dd05b80>,
'spring': <statsmodels.regression.linear_model.OLSResults at 0x2299d5322b0>,
'summer': <statsmodels.regression.linear_model.OLSResults at 0x2299d8a6d60>,
'autumn': <statsmodels.regression.linear_model.OLSResults at 0x2299d534040>,
'winter_2': <statsmodels.regression.linear_model.OLSResults at 0x2299d9401f0>}
# Build the side-by-side summary table for the five seasonal models; the
# leading 'cumulative' entry is skipped via the [1:] slice. Stars mark
# conventional significance levels.
rs_list = list(season_vars2.values())[1:]
result_summ = summary_col(rs_list, stars=True)
res_df = result_summ.tables[0]
res_df.columns = ['Winter1', 'Spring', 'Summer', 'Autumn', 'Winter2']
res_df
| Cumulative | |
|---|---|
| Intercept | 1.8644 |
| (7.4468) | |
| continent[T.AS] | 2.0664* |
| (1.1091) | |
| continent[T.EU] | 1.4363 |
| (0.9978) | |
| continent[T.N.A.] | 0.5697 |
| (0.9854) | |
| continent[T.OC] | -2.6646* |
| (1.4573) | |
| continent[T.SA] | 1.8619 |
| (1.1143) | |
| individualism_index | -0.0606 |
| (0.0700) | |
| gdp_per_capita | -0.0000 |
| (0.0000) | |
| population_density | -0.0001 |
| (0.0002) | |
| mobility_index_avg | 0.0047 |
| (0.0243) | |
| containment_health_index_avg | 0.0180 |
| (0.0622) | |
| democracy_index | -0.0109 |
| (0.0164) | |
| distance_log | 0.8620 |
| (0.7133) | |
| individualism_gov | 0.0018 |
| (0.0013) | |
| cases_neighbors | 0.0002 |
| (0.0006) | |
| R-squared | 0.4704 |
| R-squared Adj. | 0.3380 |
# Persist the seasonal regression table as LaTeX for the write-up.
latex_table = res_df.to_latex()
with open('summary_seasonal.tex','w') as tf:
    tf.write(latex_table)
# Full OLS summaries per season, Tanzania included, continent dummies in.
vars_to_use = ['individualism_index', 'gdp_per_capita',
               'population_density',
               'mobility_index', 'containment_health_index', 'democracy_index', 'distance_log', 'continent', 'individualism_gov', 'cases_neighbors']
formula = 'new_cases_per_100k_log ~ ' + ' + '.join(vars_to_use)
for season in df.season.unique():
    results = smf.ols(formula, data=df_seasons[df_seasons.season == season]).fit()
    print(f"\n\n------------------- Season: {season} -----------------------")
    print(results.summary())
# Refit the per-season model dict without Tanzania for downstream cells.
season_vars = get_season_vars(with_tanzania=False)
------------------- Season: winter_1 -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.414
Model: OLS Adj. R-squared: 0.270
Method: Least Squares F-statistic: 2.879
Date: Tue, 05 Jan 2021 Prob (F-statistic): 0.00246
Time: 22:57:08 Log-Likelihood: -248.22
No. Observations: 72 AIC: 526.4
Df Residuals: 57 BIC: 560.6
Df Model: 14
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept 76.3746 43.976 1.737 0.088 -11.687 164.436
continent[T.AS] -5.0514 6.038 -0.837 0.406 -17.141 7.039
continent[T.EU] -2.1679 4.455 -0.487 0.628 -11.089 6.753
continent[T.N.A.] 1.3849 4.750 0.292 0.772 -8.127 10.896
continent[T.OC] -1.5500 8.220 -0.189 0.851 -18.010 14.910
continent[T.SA] 2.7905 5.719 0.488 0.627 -8.662 14.243
individualism_index 0.0237 0.098 0.241 0.810 -0.173 0.220
gdp_per_capita 9.923e-05 7.31e-05 1.358 0.180 -4.71e-05 0.000
population_density 0.0005 0.001 0.441 0.661 -0.002 0.003
mobility_index 0.1675 0.282 0.593 0.556 -0.398 0.733
containment_health_index -0.1867 0.301 -0.620 0.538 -0.789 0.416
democracy_index 0.0136 0.094 0.145 0.886 -0.175 0.203
distance_log -10.4792 4.558 -2.299 0.025 -19.606 -1.352
individualism_gov 0.0079 0.006 1.298 0.200 -0.004 0.020
cases_neighbors 1.5541 3.923 0.396 0.693 -6.301 9.409
==============================================================================
Omnibus: 3.002 Durbin-Watson: 2.651
Prob(Omnibus): 0.223 Jarque-Bera (JB): 1.705
Skew: 0.026 Prob(JB): 0.426
Kurtosis: 2.248 Cond. No. 1.50e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.5e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
------------------- Season: spring -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.656
Model: OLS Adj. R-squared: 0.571
Method: Least Squares F-statistic: 7.761
Date: Tue, 05 Jan 2021 Prob (F-statistic): 8.21e-09
Time: 22:57:08 Log-Likelihood: -97.796
No. Observations: 72 AIC: 225.6
Df Residuals: 57 BIC: 259.7
Df Model: 14
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept -6.9245 5.183 -1.336 0.187 -17.304 3.455
continent[T.AS] 1.5335 0.743 2.064 0.044 0.046 3.021
continent[T.EU] 2.1367 0.523 4.085 0.000 1.089 3.184
continent[T.N.A.] 1.2364 0.587 2.108 0.039 0.062 2.411
continent[T.OC] -0.1233 0.990 -0.124 0.901 -2.107 1.860
continent[T.SA] 1.7343 0.703 2.467 0.017 0.327 3.142
individualism_index 0.0125 0.049 0.256 0.799 -0.085 0.110
gdp_per_capita 2.505e-05 9.28e-06 2.700 0.009 6.47e-06 4.36e-05
population_density 0.0003 0.000 1.956 0.055 -7.08e-06 0.001
mobility_index -0.0205 0.015 -1.370 0.176 -0.050 0.009
containment_health_index -0.0100 0.033 -0.299 0.766 -0.077 0.057
democracy_index -0.0148 0.011 -1.336 0.187 -0.037 0.007
distance_log 1.0111 0.511 1.980 0.053 -0.012 2.034
individualism_gov -6.548e-06 0.001 -0.008 0.994 -0.002 0.002
cases_neighbors 0.0024 0.004 0.623 0.536 -0.005 0.010
==============================================================================
Omnibus: 2.079 Durbin-Watson: 1.427
Prob(Omnibus): 0.354 Jarque-Bera (JB): 1.643
Skew: -0.368 Prob(JB): 0.440
Kurtosis: 3.074 Cond. No. 1.43e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.43e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
------------------- Season: summer -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.423
Model: OLS Adj. R-squared: 0.282
Method: Least Squares F-statistic: 2.991
Date: Tue, 05 Jan 2021 Prob (F-statistic): 0.00176
Time: 22:57:08 Log-Likelihood: -176.08
No. Observations: 72 AIC: 382.2
Df Residuals: 57 BIC: 416.3
Df Model: 14
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept -27.1022 13.886 -1.952 0.056 -54.908 0.704
continent[T.AS] 4.1260 2.140 1.928 0.059 -0.160 8.411
continent[T.EU] 5.1252 1.608 3.188 0.002 1.906 8.345
continent[T.N.A.] 2.2478 1.952 1.152 0.254 -1.660 6.156
continent[T.OC] 2.4160 2.894 0.835 0.407 -3.379 8.211
continent[T.SA] 1.6003 2.785 0.575 0.568 -3.976 7.176
individualism_index 0.0784 0.103 0.763 0.449 -0.127 0.284
gdp_per_capita 1.223e-05 2.6e-05 0.469 0.641 -3.99e-05 6.44e-05
population_density 0.0002 0.000 0.539 0.592 -0.001 0.001
mobility_index -0.0060 0.029 -0.206 0.837 -0.064 0.052
containment_health_index 0.2036 0.075 2.713 0.009 0.053 0.354
democracy_index -0.0033 0.033 -0.102 0.919 -0.069 0.062
distance_log 1.7596 1.417 1.242 0.219 -1.077 4.596
individualism_gov -0.0013 0.002 -0.802 0.426 -0.005 0.002
cases_neighbors 0.0019 0.006 0.346 0.731 -0.009 0.013
==============================================================================
Omnibus: 91.401 Durbin-Watson: 1.897
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1778.080
Skew: -3.649 Prob(JB): 0.00
Kurtosis: 26.226 Cond. No. 1.29e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.29e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
------------------- Season: autumn -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.595
Model: OLS Adj. R-squared: 0.495
Method: Least Squares F-statistic: 5.977
Date: Tue, 05 Jan 2021 Prob (F-statistic): 4.97e-07
Time: 22:57:08 Log-Likelihood: -168.86
No. Observations: 72 AIC: 367.7
Df Residuals: 57 BIC: 401.9
Df Model: 14
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept -37.2632 12.524 -2.975 0.004 -62.343 -12.183
continent[T.AS] 4.6602 2.021 2.306 0.025 0.614 8.706
continent[T.EU] 5.6948 1.664 3.423 0.001 2.363 9.027
continent[T.N.A.] 3.2374 1.627 1.989 0.051 -0.022 6.496
continent[T.OC] 0.5475 2.576 0.213 0.832 -4.611 5.706
continent[T.SA] 2.4500 1.939 1.263 0.212 -1.433 6.333
individualism_index 0.2448 0.095 2.574 0.013 0.054 0.435
gdp_per_capita -8.304e-06 2.35e-05 -0.354 0.725 -5.53e-05 3.87e-05
population_density -4.507e-05 0.000 -0.114 0.910 -0.001 0.001
mobility_index 0.0389 0.038 1.024 0.310 -0.037 0.115
containment_health_index 0.3267 0.074 4.422 0.000 0.179 0.475
democracy_index -0.0029 0.030 -0.096 0.924 -0.063 0.057
distance_log 2.2470 1.314 1.710 0.093 -0.384 4.878
individualism_gov -0.0041 0.002 -2.550 0.013 -0.007 -0.001
cases_neighbors 0.0014 0.002 0.819 0.416 -0.002 0.005
==============================================================================
Omnibus: 79.610 Durbin-Watson: 1.974
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1148.897
Skew: -3.064 Prob(JB): 3.31e-250
Kurtosis: 21.586 Cond. No. 1.29e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.29e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
------------------- Season: winter_2 -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.547
Model: OLS Adj. R-squared: 0.435
Method: Least Squares F-statistic: 4.909
Date: Tue, 05 Jan 2021 Prob (F-statistic): 7.62e-06
Time: 22:57:08 Log-Likelihood: -171.08
No. Observations: 72 AIC: 372.2
Df Residuals: 57 BIC: 406.3
Df Model: 14
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept -37.5908 12.896 -2.915 0.005 -63.414 -11.768
continent[T.AS] 3.9128 2.101 1.863 0.068 -0.294 8.119
continent[T.EU] 4.1091 1.931 2.128 0.038 0.242 7.977
continent[T.N.A.] 2.6432 1.765 1.498 0.140 -0.890 6.177
continent[T.OC] -2.1184 2.891 -0.733 0.467 -7.907 3.670
continent[T.SA] 1.8784 2.010 0.935 0.354 -2.146 5.903
individualism_index 0.2750 0.105 2.617 0.011 0.065 0.486
gdp_per_capita 1.95e-06 2.42e-05 0.081 0.936 -4.65e-05 5.04e-05
population_density -0.0002 0.000 -0.553 0.583 -0.001 0.001
mobility_index 0.0461 0.043 1.071 0.289 -0.040 0.132
containment_health_index 0.3122 0.076 4.095 0.000 0.160 0.465
democracy_index -0.0116 0.031 -0.371 0.712 -0.074 0.051
distance_log 2.3637 1.299 1.819 0.074 -0.238 4.966
individualism_gov -0.0044 0.002 -2.586 0.012 -0.008 -0.001
cases_neighbors 0.0014 0.001 1.269 0.210 -0.001 0.004
==============================================================================
Omnibus: 80.030 Durbin-Watson: 2.131
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1153.113
Skew: -3.091 Prob(JB): 4.03e-251
Kurtosis: 21.605 Cond. No. 1.29e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.29e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
def get_season_vars(with_tanzania=True, vars_to_use=None):
    """Fit one OLS model of log new cases per 100k for each season.

    Parameters
    ----------
    with_tanzania : bool
        If False, the Tanzania row is excluded from every season's
        sample (df_seasons is indexed by country).
    vars_to_use : list of str, optional
        Regressor names joined into the patsy formula. Defaults to the
        full specification including the continent fixed effect.

    Returns
    -------
    dict
        Maps season name to the fitted statsmodels results object.

    Notes
    -----
    FIXES: the original used a mutable list as a default argument
    (shared across calls) — replaced with a None sentinel — and
    duplicated the fitting loop in both branches, differing only in the
    row filter. Reads module-level `df_seasons` and `smf`.
    """
    if vars_to_use is None:
        vars_to_use = ['individualism_index', 'gdp_per_capita',
                       'population_density',
                       'mobility_index', 'containment_health_index', 'democracy_index', 'distance_log', 'continent', 'individualism_gov',
                       'cases_neighbors']
    formula = 'new_cases_per_100k_log ~ ' + ' + '.join(vars_to_use)
    season_vars = {}
    for season in ['winter_1', 'spring', 'summer', 'autumn', 'winter_2']:
        mask = df_seasons['season'] == season
        if not with_tanzania:
            # Index holds country names, so this drops the Tanzania row.
            mask &= df_seasons.index != 'Tanzania'
        season_vars[season] = smf.ols(formula, data=df_seasons[mask]).fit()
    return season_vars
# Seasonal models without the continent fixed effect, Tanzania excluded.
season_vars = get_season_vars(with_tanzania=False, vars_to_use = ['individualism_index', 'gdp_per_capita',
'population_density',
'mobility_index', 'containment_health_index', 'democracy_index', 'distance_log', 'individualism_gov',
'cases_neighbors'])
# Cumulative (whole-period) model on the latest snapshot, averaged
# mobility/containment indices, no continent dummies.
vars_to_use = ['individualism_index', 'gdp_per_capita',
'population_density',
'mobility_index_avg', 'containment_health_index_avg', 'democracy_index', 'distance_log', 'individualism_gov', 'cases_neighbors']
# FIX: the original filtered with `df_newest != 'Tanzania'`, which
# compares every cell to the string (elementwise boolean mask) and does
# not drop the row; filter on the country index instead, matching how
# Tanzania is excluded from df_seasons elsewhere in this file.
results_total_no_continent = smf.ols('confirmed_log ~ ' + ' + '.join(vars_to_use),
                                     data=df_newest[df_newest.index != 'Tanzania']).fit()
season_vars['cumulative'] = results_total_no_continent
# Move the last-inserted key ('cumulative') to the front of the dict.
keys_list = list(season_vars.keys())
keys_list.insert(0, keys_list.pop())
season_vars = dict(sorted(season_vars.items(), key=lambda pair: keys_list.index(pair[0])))
# Side-by-side table of all six models, written out as LaTeX.
rs_list = list(season_vars.values())
result_summ = summary_col(rs_list, stars=True)
table_nocont = result_summ.tables[0]
table_nocont.columns = ['cumulative', 'winter_1', 'spring', 'summer', 'autumn', 'winter_2']
with open('no_continent_reg.tex','w') as tf:
    tf.write(table_nocont.to_latex())
# Per-season OLS summaries without continent dummies, Tanzania excluded.
vars_to_use = ['individualism_index', 'gdp_per_capita',
               'population_density',
               'mobility_index', 'containment_health_index', 'democracy_index', 'distance_log', 'individualism_gov', 'cases_neighbors']
formula = 'new_cases_per_100k_log ~ ' + ' + '.join(vars_to_use)
not_tanzania = df_seasons.index != 'Tanzania'
for season in df.season.unique():
    results = smf.ols(formula, data=df_seasons[(df_seasons.season == season) & not_tanzania]).fit()
    print(f"\n\n------------------- Season: {season} -----------------------")
    print(results.summary())
------------------- Season: winter_1 -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.397
Model: OLS Adj. R-squared: 0.308
Method: Least Squares F-statistic: 4.468
Date: Thu, 07 Jan 2021 Prob (F-statistic): 0.000159
Time: 18:59:03 Log-Likelihood: -245.75
No. Observations: 71 AIC: 511.5
Df Residuals: 61 BIC: 534.1
Df Model: 9
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept 39.5698 22.780 1.737 0.087 -5.981 85.121
individualism_index 0.0147 0.093 0.157 0.876 -0.172 0.201
gdp_per_capita 7.487e-05 6.6e-05 1.134 0.261 -5.71e-05 0.000
population_density 0.0005 0.001 0.406 0.686 -0.002 0.003
mobility_index 0.0419 0.242 0.173 0.863 -0.441 0.525
containment_health_index -0.2079 0.285 -0.729 0.469 -0.778 0.362
democracy_index 0.0427 0.081 0.526 0.601 -0.120 0.205
distance_log -6.6397 2.374 -2.797 0.007 -11.387 -1.893
individualism_gov 0.0081 0.006 1.425 0.159 -0.003 0.019
cases_neighbors 2.2339 3.683 0.607 0.546 -5.131 9.599
==============================================================================
Omnibus: 3.643 Durbin-Watson: 2.563
Prob(Omnibus): 0.162 Jarque-Bera (JB): 1.893
Skew: 0.033 Prob(JB): 0.388
Kurtosis: 2.203 Cond. No. 7.95e+05
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 7.95e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
------------------- Season: spring -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.539
Model: OLS Adj. R-squared: 0.471
Method: Least Squares F-statistic: 7.937
Date: Thu, 07 Jan 2021 Prob (F-statistic): 1.18e-07
Time: 18:59:03 Log-Likelihood: -103.59
No. Observations: 71 AIC: 227.2
Df Residuals: 61 BIC: 249.8
Df Model: 9
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept -2.0597 3.226 -0.638 0.526 -8.511 4.392
individualism_index -0.0144 0.051 -0.281 0.780 -0.117 0.088
gdp_per_capita 2.562e-05 9.55e-06 2.684 0.009 6.53e-06 4.47e-05
population_density 0.0003 0.000 1.835 0.071 -2.62e-05 0.001
mobility_index -0.0121 0.015 -0.833 0.408 -0.041 0.017
containment_health_index -0.0328 0.036 -0.910 0.366 -0.105 0.039
democracy_index -0.0062 0.011 -0.584 0.561 -0.027 0.015
distance_log 0.7522 0.274 2.742 0.008 0.204 1.301
individualism_gov 0.0003 0.001 0.372 0.711 -0.001 0.002
cases_neighbors 0.0097 0.003 2.780 0.007 0.003 0.017
==============================================================================
Omnibus: 0.796 Durbin-Watson: 1.153
Prob(Omnibus): 0.672 Jarque-Bera (JB): 0.908
Skew: -0.208 Prob(JB): 0.635
Kurtosis: 2.634 Cond. No. 8.31e+05
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 8.31e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
------------------- Season: summer -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.412
Model: OLS Adj. R-squared: 0.326
Method: Least Squares F-statistic: 4.756
Date: Thu, 07 Jan 2021 Prob (F-statistic): 8.30e-05
Time: 18:59:03 Log-Likelihood: -120.12
No. Observations: 71 AIC: 260.2
Df Residuals: 61 BIC: 282.9
Df Model: 9
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept -7.7390 3.627 -2.133 0.037 -14.992 -0.485
individualism_index -0.0444 0.046 -0.968 0.337 -0.136 0.047
gdp_per_capita 1.818e-05 1.15e-05 1.583 0.119 -4.79e-06 4.11e-05
population_density 0.0002 0.000 1.123 0.266 -0.000 0.001
mobility_index 0.0015 0.010 0.148 0.883 -0.019 0.022
containment_health_index 0.0258 0.036 0.722 0.473 -0.046 0.097
democracy_index -0.0032 0.013 -0.237 0.813 -0.030 0.024
distance_log 1.2309 0.360 3.422 0.001 0.512 1.950
individualism_gov 0.0006 0.001 0.848 0.400 -0.001 0.002
cases_neighbors 0.0015 0.002 0.822 0.414 -0.002 0.005
==============================================================================
Omnibus: 7.740 Durbin-Watson: 1.691
Prob(Omnibus): 0.021 Jarque-Bera (JB): 7.137
Skew: -0.737 Prob(JB): 0.0282
Kurtosis: 3.490 Cond. No. 7.41e+05
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 7.41e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
------------------- Season: autumn -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.576
Model: OLS Adj. R-squared: 0.513
Method: Least Squares F-statistic: 9.196
Date: Thu, 07 Jan 2021 Prob (F-statistic): 1.20e-08
Time: 18:59:03 Log-Likelihood: -121.67
No. Observations: 71 AIC: 263.3
Df Residuals: 61 BIC: 286.0
Df Model: 9
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept -6.7117 3.740 -1.795 0.078 -14.190 0.766
individualism_index 0.0600 0.049 1.216 0.229 -0.039 0.159
gdp_per_capita -6.628e-06 1.18e-05 -0.562 0.576 -3.02e-05 1.7e-05
population_density -6.633e-05 0.000 -0.329 0.743 -0.000 0.000
mobility_index -0.0130 0.018 -0.712 0.479 -0.049 0.023
containment_health_index 0.0554 0.042 1.312 0.194 -0.029 0.140
democracy_index 0.0128 0.014 0.905 0.369 -0.015 0.041
distance_log 0.7360 0.345 2.132 0.037 0.046 1.426
individualism_gov -0.0008 0.001 -0.994 0.324 -0.003 0.001
cases_neighbors 0.0039 0.001 6.298 0.000 0.003 0.005
==============================================================================
Omnibus: 10.455 Durbin-Watson: 1.553
Prob(Omnibus): 0.005 Jarque-Bera (JB): 10.463
Skew: -0.900 Prob(JB): 0.00535
Kurtosis: 3.544 Cond. No. 7.48e+05
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 7.48e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
------------------- Season: winter_2 -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.590
Model: OLS Adj. R-squared: 0.530
Method: Least Squares F-statistic: 9.766
Date: Thu, 07 Jan 2021 Prob (F-statistic): 4.46e-09
Time: 18:59:03 Log-Likelihood: -118.09
No. Observations: 71 AIC: 256.2
Df Residuals: 61 BIC: 278.8
Df Model: 9
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept -3.4210 3.697 -0.925 0.358 -10.814 3.972
individualism_index -0.0293 0.047 -0.624 0.535 -0.123 0.065
gdp_per_capita -3.149e-06 1.1e-05 -0.286 0.776 -2.52e-05 1.89e-05
population_density -0.0002 0.000 -1.037 0.304 -0.001 0.000
mobility_index -0.0490 0.020 -2.468 0.016 -0.089 -0.009
containment_health_index -0.0090 0.040 -0.224 0.823 -0.089 0.071
democracy_index 0.0136 0.014 1.006 0.318 -0.013 0.041
distance_log 0.7082 0.310 2.284 0.026 0.088 1.328
individualism_gov 0.0006 0.001 0.797 0.429 -0.001 0.002
cases_neighbors 0.0015 0.000 3.787 0.000 0.001 0.002
==============================================================================
Omnibus: 16.188 Durbin-Watson: 1.626
Prob(Omnibus): 0.000 Jarque-Bera (JB): 19.847
Skew: -1.018 Prob(JB): 4.90e-05
Kurtosis: 4.600 Cond. No. 7.78e+05
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 7.78e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
# Per-season OLS summaries WITH continent dummies, Tanzania excluded.
vars_to_use = ['individualism_index', 'gdp_per_capita',
               'population_density',
               'mobility_index', 'containment_health_index', 'democracy_index', 'distance_log', 'continent', 'individualism_gov', 'cases_neighbors']
formula = 'new_cases_per_100k_log ~ ' + ' + '.join(vars_to_use)
not_tanzania = df_seasons.index != 'Tanzania'
for season in df.season.unique():
    results = smf.ols(formula, data=df_seasons[(df_seasons.season == season) & not_tanzania]).fit()
    print(f"\n\n------------------- Season: {season} -----------------------")
    print(results.summary())
------------------- Season: winter_1 -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.409
Model: OLS Adj. R-squared: 0.262
Method: Least Squares F-statistic: 2.772
Date: Thu, 07 Jan 2021 Prob (F-statistic): 0.00347
Time: 14:02:06 Log-Likelihood: -245.04
No. Observations: 71 AIC: 520.1
Df Residuals: 56 BIC: 554.0
Df Model: 14
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept 78.6516 44.384 1.772 0.082 -10.261 167.564
continent[T.AS] -5.8156 6.202 -0.938 0.352 -18.240 6.609
continent[T.EU] -2.8400 4.616 -0.615 0.541 -12.088 6.408
continent[T.N.A.] 0.7481 4.892 0.153 0.879 -9.051 10.547
continent[T.OC] -2.2508 8.347 -0.270 0.788 -18.973 14.471
continent[T.SA] 2.1283 5.855 0.363 0.718 -9.601 13.858
individualism_index 0.0179 0.099 0.181 0.857 -0.181 0.217
gdp_per_capita 9.956e-05 7.35e-05 1.355 0.181 -4.76e-05 0.000
population_density 0.0005 0.001 0.439 0.662 -0.002 0.003
mobility_index 0.1595 0.284 0.561 0.577 -0.410 0.729
containment_health_index -0.2044 0.304 -0.672 0.504 -0.814 0.405
democracy_index 0.0152 0.095 0.160 0.874 -0.175 0.205
distance_log -10.6357 4.591 -2.317 0.024 -19.832 -1.439
individualism_gov 0.0082 0.006 1.340 0.185 -0.004 0.021
cases_neighbors 1.5346 3.945 0.389 0.699 -6.368 9.437
==============================================================================
Omnibus: 3.442 Durbin-Watson: 2.625
Prob(Omnibus): 0.179 Jarque-Bera (JB): 1.828
Skew: 0.013 Prob(JB): 0.401
Kurtosis: 2.214 Cond. No. 1.51e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.51e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
------------------- Season: spring -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.651
Model: OLS Adj. R-squared: 0.564
Method: Least Squares F-statistic: 7.462
Date: Thu, 07 Jan 2021 Prob (F-statistic): 1.81e-08
Time: 14:02:06 Log-Likelihood: -93.742
No. Observations: 71 AIC: 217.5
Df Residuals: 56 BIC: 251.4
Df Model: 14
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept -4.9634 5.072 -0.979 0.332 -15.124 5.197
continent[T.AS] 1.2910 0.724 1.783 0.080 -0.160 2.742
continent[T.EU] 1.8514 0.520 3.564 0.001 0.811 2.892
continent[T.N.A.] 1.0195 0.574 1.777 0.081 -0.130 2.169
continent[T.OC] -0.3414 0.960 -0.356 0.723 -2.264 1.582
continent[T.SA] 1.5059 0.685 2.198 0.032 0.133 2.879
individualism_index -0.0083 0.048 -0.173 0.863 -0.104 0.088
gdp_per_capita 2.468e-05 8.95e-06 2.757 0.008 6.75e-06 4.26e-05
population_density 0.0003 0.000 2.043 0.046 5.84e-06 0.001
mobility_index -0.0208 0.014 -1.440 0.155 -0.050 0.008
containment_health_index -0.0345 0.034 -1.017 0.313 -0.102 0.033
democracy_index -0.0141 0.011 -1.321 0.192 -0.036 0.007
distance_log 0.9754 0.493 1.979 0.053 -0.012 1.963
individualism_gov 0.0003 0.001 0.423 0.674 -0.001 0.002
cases_neighbors 0.0027 0.004 0.727 0.470 -0.005 0.010
==============================================================================
Omnibus: 2.093 Durbin-Watson: 1.339
Prob(Omnibus): 0.351 Jarque-Bera (JB): 1.770
Skew: -0.387 Prob(JB): 0.413
Kurtosis: 2.983 Cond. No. 1.45e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.45e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
------------------- Season: summer -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.523
Model: OLS Adj. R-squared: 0.404
Method: Least Squares F-statistic: 4.388
Date: Thu, 07 Jan 2021 Prob (F-statistic): 3.33e-05
Time: 14:02:07 Log-Likelihood: -112.70
No. Observations: 71 AIC: 255.4
Df Residuals: 56 BIC: 289.3
Df Model: 14
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept -12.4994 5.966 -2.095 0.041 -24.451 -0.548
continent[T.AS] 1.5181 0.923 1.644 0.106 -0.331 3.367
continent[T.EU] 2.1529 0.707 3.044 0.004 0.736 3.570
continent[T.N.A.] 0.7208 0.834 0.864 0.391 -0.950 2.392
continent[T.OC] -0.4823 1.242 -0.388 0.699 -2.971 2.006
continent[T.SA] 0.5209 1.184 0.440 0.662 -1.852 2.894
individualism_index -0.0478 0.044 -1.079 0.285 -0.137 0.041
gdp_per_capita 1.235e-05 1.11e-05 1.117 0.269 -9.81e-06 3.45e-05
population_density 0.0002 0.000 1.041 0.302 -0.000 0.001
mobility_index -0.0193 0.012 -1.568 0.122 -0.044 0.005
containment_health_index 0.0185 0.034 0.547 0.587 -0.049 0.086
democracy_index -0.0050 0.014 -0.362 0.719 -0.033 0.023
distance_log 1.6458 0.602 2.736 0.008 0.441 2.851
individualism_gov 0.0008 0.001 1.119 0.268 -0.001 0.002
cases_neighbors 0.0009 0.002 0.368 0.714 -0.004 0.006
==============================================================================
Omnibus: 5.196 Durbin-Watson: 2.134
Prob(Omnibus): 0.074 Jarque-Bera (JB): 4.555
Skew: -0.609 Prob(JB): 0.103
Kurtosis: 3.235 Cond. No. 1.31e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.31e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
------------------- Season: autumn -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.793
Model: OLS Adj. R-squared: 0.741
Method: Least Squares F-statistic: 15.31
Date: Thu, 07 Jan 2021 Prob (F-statistic): 2.53e-14
Time: 14:02:07 Log-Likelihood: -96.215
No. Observations: 71 AIC: 222.4
Df Residuals: 56 BIC: 256.4
Df Model: 14
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept -22.0467 4.731 -4.660 0.000 -31.524 -12.569
continent[T.AS] 3.2832 0.756 4.345 0.000 1.770 4.797
continent[T.EU] 3.4457 0.631 5.463 0.000 2.182 4.709
continent[T.N.A.] 1.0693 0.617 1.734 0.088 -0.166 2.304
continent[T.OC] -2.2546 0.970 -2.324 0.024 -4.198 -0.311
continent[T.SA] 0.6305 0.728 0.866 0.390 -0.828 2.089
individualism_index 0.0417 0.037 1.128 0.264 -0.032 0.116
gdp_per_capita -1.264e-05 8.74e-06 -1.447 0.153 -3.01e-05 4.86e-06
population_density -7.432e-05 0.000 -0.504 0.616 -0.000 0.000
mobility_index -0.0211 0.015 -1.452 0.152 -0.050 0.008
containment_health_index 0.0379 0.031 1.204 0.234 -0.025 0.101
democracy_index 0.0145 0.011 1.296 0.200 -0.008 0.037
distance_log 2.3798 0.489 4.865 0.000 1.400 3.360
individualism_gov -0.0004 0.001 -0.656 0.515 -0.002 0.001
cases_neighbors 0.0012 0.001 1.914 0.061 -5.57e-05 0.002
==============================================================================
Omnibus: 14.665 Durbin-Watson: 1.806
Prob(Omnibus): 0.001 Jarque-Bera (JB): 18.577
Skew: -0.890 Prob(JB): 9.25e-05
Kurtosis: 4.764 Cond. No. 1.31e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.31e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
------------------- Season: winter_2 -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.778
Model: OLS Adj. R-squared: 0.723
Method: Least Squares F-statistic: 14.05
Date: Thu, 07 Jan 2021 Prob (F-statistic): 1.52e-13
Time: 14:02:07 Log-Likelihood: -96.287
No. Observations: 71 AIC: 222.6
Df Residuals: 56 BIC: 256.5
Df Model: 14
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept -17.4795 4.771 -3.663 0.001 -27.038 -7.921
continent[T.AS] 2.2654 0.764 2.967 0.004 0.736 3.795
continent[T.EU] 2.0299 0.706 2.876 0.006 0.616 3.444
continent[T.N.A.] 0.2587 0.649 0.399 0.692 -1.042 1.559
continent[T.OC] -4.3276 1.050 -4.120 0.000 -6.432 -2.223
continent[T.SA] 0.1582 0.731 0.216 0.829 -1.307 1.623
individualism_index 0.0386 0.040 0.970 0.336 -0.041 0.118
gdp_per_capita -1.062e-05 8.77e-06 -1.212 0.231 -2.82e-05 6.94e-06
population_density -0.0002 0.000 -1.344 0.184 -0.000 9.7e-05
mobility_index -0.0367 0.016 -2.279 0.026 -0.069 -0.004
containment_health_index 0.0058 0.032 0.183 0.855 -0.058 0.069
democracy_index 0.0216 0.011 1.893 0.064 -0.001 0.044
distance_log 2.0404 0.470 4.344 0.000 1.099 2.981
individualism_gov -0.0003 0.001 -0.535 0.595 -0.002 0.001
cases_neighbors 0.0005 0.000 1.237 0.221 -0.000 0.001
==============================================================================
Omnibus: 9.005 Durbin-Watson: 1.808
Prob(Omnibus): 0.011 Jarque-Bera (JB): 8.893
Skew: -0.693 Prob(JB): 0.0117
Kurtosis: 4.042 Cond. No. 1.32e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.32e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
# Per-season OLS: regress log new cases per 100k on the structural
# predictors (this specification excludes individualism_gov).
vars_to_use = [
    'individualism_index', 'gdp_per_capita', 'population_density',
    'mobility_index', 'containment_health_index', 'democracy_index',
    'distance_log', 'continent', 'cases_neighbors',
]
formula = 'new_cases_per_100k_log ~ ' + ' + '.join(vars_to_use)
# NOTE(review): season labels come from `df` while the fitted data comes
# from `df_seasons` -- confirm both frames carry the same season values.
for season in df.season.unique():
    results = smf.ols(formula, data=df_seasons[df_seasons.season == season]).fit()
    print(f"\n\n------------------- Season: {season} -----------------------")
    print(results.summary())
------------------- Season: winter_1 -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.397
Model: OLS Adj. R-squared: 0.262
Method: Least Squares F-statistic: 2.936
Date: Wed, 06 Jan 2021 Prob (F-statistic): 0.00246
Time: 11:18:18 Log-Likelihood: -249.27
No. Observations: 72 AIC: 526.5
Df Residuals: 58 BIC: 558.4
Df Model: 13
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept 63.2668 43.053 1.470 0.147 -22.912 149.446
continent[T.AS] -5.0794 6.073 -0.836 0.406 -17.236 7.077
continent[T.EU] -2.2781 4.480 -0.508 0.613 -11.247 6.691
continent[T.N.A.] 0.9213 4.764 0.193 0.847 -8.615 10.458
continent[T.OC] 1.0310 8.023 0.129 0.898 -15.028 17.090
continent[T.SA] 2.5496 5.750 0.443 0.659 -8.960 14.060
individualism_index 0.1126 0.071 1.592 0.117 -0.029 0.254
gdp_per_capita 9.285e-05 7.33e-05 1.266 0.210 -5.39e-05 0.000
population_density 0.0004 0.001 0.315 0.754 -0.002 0.003
mobility_index 0.2440 0.278 0.878 0.384 -0.312 0.800
containment_health_index 0.1483 0.156 0.952 0.345 -0.163 0.460
democracy_index 0.0052 0.095 0.055 0.956 -0.184 0.195
distance_log -9.3564 4.501 -2.079 0.042 -18.367 -0.346
cases_neighbors 1.4852 3.945 0.376 0.708 -6.412 9.382
==============================================================================
Omnibus: 3.147 Durbin-Watson: 2.642
Prob(Omnibus): 0.207 Jarque-Bera (JB): 1.819
Skew: -0.099 Prob(JB): 0.403
Kurtosis: 2.247 Cond. No. 1.46e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.46e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
------------------- Season: spring -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.656
Model: OLS Adj. R-squared: 0.579
Method: Least Squares F-statistic: 8.504
Date: Wed, 06 Jan 2021 Prob (F-statistic): 2.72e-09
Time: 11:18:18 Log-Likelihood: -97.796
No. Observations: 72 AIC: 223.6
Df Residuals: 58 BIC: 255.5
Df Model: 13
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept -6.9110 4.850 -1.425 0.160 -16.619 2.797
continent[T.AS] 1.5341 0.733 2.094 0.041 0.067 3.001
continent[T.EU] 2.1371 0.515 4.146 0.000 1.105 3.169
continent[T.N.A.] 1.2376 0.562 2.201 0.032 0.112 2.363
continent[T.OC] -0.1224 0.975 -0.126 0.901 -2.074 1.829
continent[T.SA] 1.7354 0.682 2.546 0.014 0.371 3.100
individualism_index 0.0121 0.009 1.317 0.193 -0.006 0.030
gdp_per_capita 2.506e-05 9.08e-06 2.760 0.008 6.89e-06 4.32e-05
population_density 0.0003 0.000 1.975 0.053 -4e-06 0.001
mobility_index -0.0205 0.014 -1.431 0.158 -0.049 0.008
containment_health_index -0.0102 0.018 -0.554 0.582 -0.047 0.027
democracy_index -0.0148 0.011 -1.349 0.183 -0.037 0.007
distance_log 1.0110 0.506 1.997 0.051 -0.002 2.024
cases_neighbors 0.0024 0.004 0.628 0.532 -0.005 0.010
==============================================================================
Omnibus: 2.083 Durbin-Watson: 1.427
Prob(Omnibus): 0.353 Jarque-Bera (JB): 1.648
Skew: -0.369 Prob(JB): 0.439
Kurtosis: 3.073 Cond. No. 1.35e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.35e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
------------------- Season: summer -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.417
Model: OLS Adj. R-squared: 0.286
Method: Least Squares F-statistic: 3.191
Date: Wed, 06 Jan 2021 Prob (F-statistic): 0.00117
Time: 11:18:18 Log-Likelihood: -176.49
No. Observations: 72 AIC: 381.0
Df Residuals: 58 BIC: 412.8
Df Model: 13
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept -24.0972 13.329 -1.808 0.076 -50.778 2.584
continent[T.AS] 4.2484 2.128 1.996 0.051 -0.011 8.508
continent[T.EU] 5.3821 1.571 3.427 0.001 2.238 8.526
continent[T.N.A.] 2.4649 1.927 1.279 0.206 -1.392 6.322
continent[T.OC] 2.8367 2.837 1.000 0.322 -2.843 8.516
continent[T.SA] 1.7484 2.770 0.631 0.530 -3.796 7.293
individualism_index -0.0010 0.027 -0.036 0.971 -0.056 0.054
gdp_per_capita 1.506e-05 2.57e-05 0.585 0.561 -3.64e-05 6.66e-05
population_density 0.0002 0.000 0.491 0.625 -0.001 0.001
mobility_index -0.0076 0.029 -0.264 0.792 -0.065 0.050
containment_health_index 0.1521 0.039 3.941 0.000 0.075 0.229
democracy_index -0.0013 0.032 -0.040 0.968 -0.066 0.064
distance_log 1.7314 1.412 1.226 0.225 -1.095 4.557
cases_neighbors 0.0024 0.006 0.427 0.671 -0.009 0.013
==============================================================================
Omnibus: 95.371 Durbin-Watson: 1.907
Prob(Omnibus): 0.000 Jarque-Bera (JB): 2034.019
Skew: -3.865 Prob(JB): 0.00
Kurtosis: 27.865 Cond. No. 1.24e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.24e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
------------------- Season: autumn -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.549
Model: OLS Adj. R-squared: 0.447
Method: Least Squares F-statistic: 5.422
Date: Wed, 06 Jan 2021 Prob (F-statistic): 2.83e-06
Time: 11:18:18 Log-Likelihood: -172.75
No. Observations: 72 AIC: 373.5
Df Residuals: 58 BIC: 405.4
Df Model: 13
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept -31.4920 12.889 -2.443 0.018 -57.293 -5.691
continent[T.AS] 5.5655 2.081 2.674 0.010 1.399 9.732
continent[T.EU] 6.7605 1.685 4.012 0.000 3.387 10.134
continent[T.N.A.] 3.8513 1.684 2.287 0.026 0.480 7.223
continent[T.OC] 1.7736 2.648 0.670 0.506 -3.527 7.074
continent[T.SA] 3.2303 2.004 1.612 0.112 -0.781 7.241
individualism_index 0.0101 0.025 0.403 0.688 -0.040 0.060
gdp_per_capita -1.136e-05 2.45e-05 -0.463 0.645 -6.04e-05 3.77e-05
population_density -4.644e-05 0.000 -0.112 0.911 -0.001 0.001
mobility_index 0.0430 0.040 1.082 0.284 -0.037 0.123
containment_health_index 0.1662 0.040 4.105 0.000 0.085 0.247
democracy_index 0.0100 0.031 0.323 0.748 -0.052 0.072
distance_log 2.5065 1.371 1.828 0.073 -0.238 5.250
cases_neighbors 0.0005 0.002 0.270 0.788 -0.003 0.004
==============================================================================
Omnibus: 93.753 Durbin-Watson: 1.908
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1879.921
Skew: -3.792 Prob(JB): 0.00
Kurtosis: 26.856 Cond. No. 1.27e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.27e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
------------------- Season: winter_2 -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.493
Model: OLS Adj. R-squared: 0.380
Method: Least Squares F-statistic: 4.346
Date: Wed, 06 Jan 2021 Prob (F-statistic): 4.66e-05
Time: 11:18:18 Log-Likelihood: -175.08
No. Observations: 72 AIC: 378.2
Df Residuals: 58 BIC: 410.0
Df Model: 13
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept -28.1049 12.955 -2.169 0.034 -54.037 -2.173
continent[T.AS] 4.5537 2.186 2.083 0.042 0.178 8.929
continent[T.EU] 4.9487 1.995 2.481 0.016 0.955 8.942
continent[T.N.A.] 3.0232 1.843 1.641 0.106 -0.665 6.712
continent[T.OC] 0.9615 2.760 0.348 0.729 -4.563 6.486
continent[T.SA] 2.5182 2.090 1.205 0.233 -1.665 6.702
individualism_index 0.0107 0.026 0.416 0.679 -0.041 0.062
gdp_per_capita -9.27e-06 2.5e-05 -0.372 0.712 -5.92e-05 4.07e-05
population_density -0.0002 0.000 -0.446 0.657 -0.001 0.001
mobility_index 0.0358 0.045 0.798 0.428 -0.054 0.126
containment_health_index 0.1430 0.041 3.487 0.001 0.061 0.225
democracy_index 0.0044 0.032 0.137 0.892 -0.060 0.069
distance_log 2.2735 1.361 1.670 0.100 -0.451 4.998
cases_neighbors 0.0009 0.001 0.799 0.428 -0.001 0.003
==============================================================================
Omnibus: 94.816 Durbin-Watson: 1.960
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1903.211
Skew: -3.866 Prob(JB): 0.00
Kurtosis: 26.971 Cond. No. 1.24e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.24e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
# Correlation heatmap of the regressors used in the seasonal models.
# (Re-import is redundant: seaborn is already imported at the top of the file.)
import seaborn as sns
sns.heatmap(df_seasons[vars_to_use].corr(), annot=True)
<AxesSubplot:>
import statsmodels.api as sm
def get_plot(results_total, df_newest, savefig = False):
    """Diagnostic plots for the cumulative model.

    Left panel: observed vs. predicted values; right panel: residuals vs.
    predicted values, each with a lowess trend line.  When *savefig* is
    true the figure is written to 'residuals_cumulative.png'.
    """
    predicted = results_total.predict()
    observed = df_newest['confirmed_log']
    residuals = results_total.resid
    fig, axes = plt.subplots(1, 2, figsize=(12, 6))
    panels = (
        (axes[0], observed, 'Observed vs. Predicted Values', 'Observed'),
        (axes[1], residuals, 'Residuals vs. Predicted Values', 'Residuals'),
    )
    for axis, values, title, ylab in panels:
        sns.regplot(x=predicted, y=values, lowess=True, ax=axis,
                    line_kws={'color': 'grey'})
        axis.set_title(title, fontsize=14)
        axis.set(xlabel='Predicted', ylabel=ylab)
    if savefig:
        fig.savefig('residuals_cumulative.png')
# Inspect the autumn rows whose model residual is a large negative outlier;
# the table below shows it is Tanzania (zero reported new cases).
# NOTE(review): `season_vars` is only defined in a later cell of this
# export -- the notebook was evidently executed out of order.
aut = df_seasons[df_seasons.season == 'autumn']
aut[season_vars['autumn'].resid < -15]
| country | season | continent | individualism_index | health_expenditure | gdp_per_capita | over65_per_capita | population_density | mobility_index | containment_health_index | democracy_index | distance_log | individualism_gov | cases_neighbors | new_cases_per_100k | new_cases_per_100k_log | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 310 | Tanzania | autumn | AF | 25.0 | 3.645451 | 985.448173 | 0.026205 | 63.579079 | -11.187826 | 17.363541 | 51.6 | 9.136094 | 434.08853 | 10.597085 | 0.0 | -23.025851 |
# Refit the autumn model, this time including the individualism_gov term
# alongside the base predictors.
vars_to_use = [
    'individualism_index', 'gdp_per_capita', 'population_density',
    'mobility_index', 'containment_health_index', 'democracy_index',
    'distance_log', 'continent', 'individualism_gov', 'cases_neighbors',
]
season = 'autumn'
rhs = ' + '.join(vars_to_use)
results = smf.ols(f'new_cases_per_100k_log ~ {rhs}',
                  data=df_seasons[df_seasons.season == season]).fit()
print(f"\n\n------------------- Season: {season} -----------------------")
print(results.summary())
------------------- Season: autumn -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.595
Model: OLS Adj. R-squared: 0.495
Method: Least Squares F-statistic: 5.977
Date: Thu, 07 Jan 2021 Prob (F-statistic): 4.97e-07
Time: 07:46:50 Log-Likelihood: -168.86
No. Observations: 72 AIC: 367.7
Df Residuals: 57 BIC: 401.9
Df Model: 14
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept -37.2632 12.524 -2.975 0.004 -62.343 -12.183
continent[T.AS] 4.6602 2.021 2.306 0.025 0.614 8.706
continent[T.EU] 5.6948 1.664 3.423 0.001 2.363 9.027
continent[T.N.A.] 3.2374 1.627 1.989 0.051 -0.022 6.496
continent[T.OC] 0.5475 2.576 0.213 0.832 -4.611 5.706
continent[T.SA] 2.4500 1.939 1.263 0.212 -1.433 6.333
individualism_index 0.2448 0.095 2.574 0.013 0.054 0.435
gdp_per_capita -8.304e-06 2.35e-05 -0.354 0.725 -5.53e-05 3.87e-05
population_density -4.507e-05 0.000 -0.114 0.910 -0.001 0.001
mobility_index 0.0389 0.038 1.024 0.310 -0.037 0.115
containment_health_index 0.3267 0.074 4.422 0.000 0.179 0.475
democracy_index -0.0029 0.030 -0.096 0.924 -0.063 0.057
distance_log 2.2470 1.314 1.710 0.093 -0.384 4.878
individualism_gov -0.0041 0.002 -2.550 0.013 -0.007 -0.001
cases_neighbors 0.0014 0.002 0.819 0.416 -0.002 0.005
==============================================================================
Omnibus: 79.610 Durbin-Watson: 1.974
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1148.897
Skew: -3.064 Prob(JB): 3.31e-250
Kurtosis: 21.586 Cond. No. 1.29e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.29e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
df_seasons[(df_seasons['new_cases_per_100k'] == 0) & (df_seasons['season'] != 'winter_1')]
| country | season | continent | individualism_index | health_expenditure | gdp_per_capita | over65_per_capita | population_density | mobility_index | containment_health_index | democracy_index | distance_log | individualism_gov | cases_neighbors | new_cases_per_100k | new_cases_per_100k_log | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 310 | Tanzania | autumn | AF | 25.0 | 3.645451 | 985.448173 | 0.026205 | 63.579079 | -11.187826 | 17.363541 | 51.6 | 9.136094 | 434.088530 | 10.597085 | 0.0 | -23.025851 |
| 312 | Tanzania | summer | AF | 25.0 | 3.645451 | 985.448173 | 0.026205 | 63.579079 | -4.145647 | 26.077405 | 51.6 | 9.136094 | 651.935125 | 9.552368 | 0.0 | -23.025851 |
| 314 | Tanzania | winter_2 | AF | 25.0 | 3.645451 | 985.448173 | 0.026205 | 63.579079 | -2.186495 | 14.700476 | 51.6 | 9.136094 | 367.511905 | 14.925671 | 0.0 | -23.025851 |
df_autumn = df_seasons[df_seasons.season == 'autumn']

def plot_values(results, df):
    """Plot observed-vs-predicted and residuals-vs-predicted diagnostics
    (each lowess-smoothed) for a fitted seasonal model."""
    predicted = results.predict()
    residuals = results.resid
    observed = df['new_cases_per_100k_log']
    fig, axes = plt.subplots(1, 2, figsize=(12, 6))
    for axis, values, title, ylab in (
        (axes[0], observed, 'Observed vs. Predicted Values', 'Observed'),
        (axes[1], residuals, 'Residuals vs. Predicted Values', 'Residuals'),
    ):
        sns.regplot(x=predicted, y=values, lowess=True, ax=axis,
                    line_kws={'color': 'grey'})
        axis.set_title(title, fontsize=14)
        axis.set(xlabel='Predicted', ylabel=ylab)

plot_values(results, df_seasons[df_seasons.season == 'autumn'])
Re-running the autumn model with Tanzania (the zero-case outlier) excluded
# Drop the Tanzania outlier and refit the autumn specification.
df_autumn = df_autumn.loc[df_autumn.country != 'Tanzania']
formula = 'new_cases_per_100k_log ~ ' + ' + '.join(vars_to_use)
results = smf.ols(formula, data=df_autumn).fit()
print(f"\n\n------------------- Season: {season} -----------------------")
print(results.summary())
------------------- Season: autumn -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.793
Model: OLS Adj. R-squared: 0.741
Method: Least Squares F-statistic: 15.31
Date: Thu, 07 Jan 2021 Prob (F-statistic): 2.53e-14
Time: 07:46:58 Log-Likelihood: -96.215
No. Observations: 71 AIC: 222.4
Df Residuals: 56 BIC: 256.4
Df Model: 14
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept -22.0467 4.731 -4.660 0.000 -31.524 -12.569
continent[T.AS] 3.2832 0.756 4.345 0.000 1.770 4.797
continent[T.EU] 3.4457 0.631 5.463 0.000 2.182 4.709
continent[T.N.A.] 1.0693 0.617 1.734 0.088 -0.166 2.304
continent[T.OC] -2.2546 0.970 -2.324 0.024 -4.198 -0.311
continent[T.SA] 0.6305 0.728 0.866 0.390 -0.828 2.089
individualism_index 0.0417 0.037 1.128 0.264 -0.032 0.116
gdp_per_capita -1.264e-05 8.74e-06 -1.447 0.153 -3.01e-05 4.86e-06
population_density -7.432e-05 0.000 -0.504 0.616 -0.000 0.000
mobility_index -0.0211 0.015 -1.452 0.152 -0.050 0.008
containment_health_index 0.0379 0.031 1.204 0.234 -0.025 0.101
democracy_index 0.0145 0.011 1.296 0.200 -0.008 0.037
distance_log 2.3798 0.489 4.865 0.000 1.400 3.360
individualism_gov -0.0004 0.001 -0.656 0.515 -0.002 0.001
cases_neighbors 0.0012 0.001 1.914 0.061 -5.57e-05 0.002
==============================================================================
Omnibus: 14.665 Durbin-Watson: 1.806
Prob(Omnibus): 0.001 Jarque-Bera (JB): 18.577
Skew: -0.890 Prob(JB): 9.25e-05
Kurtosis: 4.764 Cond. No. 1.31e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.31e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
def plot_values(results, df):
    """Diagnostics for a fitted model: observed values and residuals
    plotted against the model's predictions, with lowess trend lines."""
    predicted = results.predict()
    residuals = results.resid
    observed = df['new_cases_per_100k_log']
    fig, axes = plt.subplots(1, 2, figsize=(12, 6))
    sns.regplot(x=predicted, y=observed, lowess=True, ax=axes[0],
                line_kws={'color': 'grey'})
    axes[0].set_title('Observed vs. Predicted Values', fontsize=14)
    axes[0].set(xlabel='Predicted', ylabel='Observed')
    sns.regplot(x=predicted, y=residuals, lowess=True, ax=axes[1],
                line_kws={'color': 'grey'})
    axes[1].set_title('Residuals vs. Predicted Values', fontsize=14)
    axes[1].set(xlabel='Predicted', ylabel='Residuals')

plot_values(results, df_autumn)
sm.graphics.influence_plot(results, criterion="cooks");
# Same specification as the autumn cell above, now fitted to summer.
vars_to_use = [
    'individualism_index', 'gdp_per_capita', 'population_density',
    'mobility_index', 'containment_health_index', 'democracy_index',
    'distance_log', 'continent', 'individualism_gov', 'cases_neighbors',
]
season = 'summer'
rhs = ' + '.join(vars_to_use)
results = smf.ols(f'new_cases_per_100k_log ~ {rhs}',
                  data=df_seasons[df_seasons.season == season]).fit()
print(f"\n\n------------------- Season: {season} -----------------------")
print(results.summary())
------------------- Season: summer -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.423
Model: OLS Adj. R-squared: 0.282
Method: Least Squares F-statistic: 2.991
Date: Thu, 07 Jan 2021 Prob (F-statistic): 0.00176
Time: 08:02:02 Log-Likelihood: -176.08
No. Observations: 72 AIC: 382.2
Df Residuals: 57 BIC: 416.3
Df Model: 14
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept -27.1022 13.886 -1.952 0.056 -54.908 0.704
continent[T.AS] 4.1260 2.140 1.928 0.059 -0.160 8.411
continent[T.EU] 5.1252 1.608 3.188 0.002 1.906 8.345
continent[T.N.A.] 2.2478 1.952 1.152 0.254 -1.660 6.156
continent[T.OC] 2.4160 2.894 0.835 0.407 -3.379 8.211
continent[T.SA] 1.6003 2.785 0.575 0.568 -3.976 7.176
individualism_index 0.0784 0.103 0.763 0.449 -0.127 0.284
gdp_per_capita 1.223e-05 2.6e-05 0.469 0.641 -3.99e-05 6.44e-05
population_density 0.0002 0.000 0.539 0.592 -0.001 0.001
mobility_index -0.0060 0.029 -0.206 0.837 -0.064 0.052
containment_health_index 0.2036 0.075 2.713 0.009 0.053 0.354
democracy_index -0.0033 0.033 -0.102 0.919 -0.069 0.062
distance_log 1.7596 1.417 1.242 0.219 -1.077 4.596
individualism_gov -0.0013 0.002 -0.802 0.426 -0.005 0.002
cases_neighbors 0.0019 0.006 0.346 0.731 -0.009 0.013
==============================================================================
Omnibus: 91.401 Durbin-Watson: 1.897
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1778.080
Skew: -3.649 Prob(JB): 0.00
Kurtosis: 26.226 Cond. No. 1.29e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.29e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
df_summer = df_seasons[df_seasons.season == 'summer']

def plot_values(results, df):
    """Plot observed-vs-predicted and residuals-vs-predicted diagnostics
    (each lowess-smoothed) for a fitted seasonal model."""
    predicted = results.predict()
    residuals = results.resid
    observed = df['new_cases_per_100k_log']
    fig, axes = plt.subplots(1, 2, figsize=(12, 6))
    for axis, values, title, ylab in (
        (axes[0], observed, 'Observed vs. Predicted Values', 'Observed'),
        (axes[1], residuals, 'Residuals vs. Predicted Values', 'Residuals'),
    ):
        sns.regplot(x=predicted, y=values, lowess=True, ax=axis,
                    line_kws={'color': 'grey'})
        axis.set_title(title, fontsize=14)
        axis.set(xlabel='Predicted', ylabel=ylab)

plot_values(results, df_seasons[df_seasons.season == 'summer'])
Re-running the summer model with Tanzania (the zero-case outlier) excluded
# Drop the Tanzania outlier and refit the summer specification.
df_summer = df_summer.loc[df_summer.country != 'Tanzania']
formula = 'new_cases_per_100k_log ~ ' + ' + '.join(vars_to_use)
results = smf.ols(formula, data=df_summer).fit()
print(f"\n\n------------------- Season: {season} -----------------------")
print(results.summary())
------------------- Season: summer -----------------------
OLS Regression Results
==================================================================================
Dep. Variable: new_cases_per_100k_log R-squared: 0.523
Model: OLS Adj. R-squared: 0.404
Method: Least Squares F-statistic: 4.388
Date: Thu, 07 Jan 2021 Prob (F-statistic): 3.33e-05
Time: 08:02:14 Log-Likelihood: -112.70
No. Observations: 71 AIC: 255.4
Df Residuals: 56 BIC: 289.3
Df Model: 14
Covariance Type: nonrobust
============================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept -12.4994 5.966 -2.095 0.041 -24.451 -0.548
continent[T.AS] 1.5181 0.923 1.644 0.106 -0.331 3.367
continent[T.EU] 2.1529 0.707 3.044 0.004 0.736 3.570
continent[T.N.A.] 0.7208 0.834 0.864 0.391 -0.950 2.392
continent[T.OC] -0.4823 1.242 -0.388 0.699 -2.971 2.006
continent[T.SA] 0.5209 1.184 0.440 0.662 -1.852 2.894
individualism_index -0.0478 0.044 -1.079 0.285 -0.137 0.041
gdp_per_capita 1.235e-05 1.11e-05 1.117 0.269 -9.81e-06 3.45e-05
population_density 0.0002 0.000 1.041 0.302 -0.000 0.001
mobility_index -0.0193 0.012 -1.568 0.122 -0.044 0.005
containment_health_index 0.0185 0.034 0.547 0.587 -0.049 0.086
democracy_index -0.0050 0.014 -0.362 0.719 -0.033 0.023
distance_log 1.6458 0.602 2.736 0.008 0.441 2.851
individualism_gov 0.0008 0.001 1.119 0.268 -0.001 0.002
cases_neighbors 0.0009 0.002 0.368 0.714 -0.004 0.006
==============================================================================
Omnibus: 5.196 Durbin-Watson: 2.134
Prob(Omnibus): 0.074 Jarque-Bera (JB): 4.555
Skew: -0.609 Prob(JB): 0.103
Kurtosis: 3.235 Cond. No. 1.31e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.31e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
# Diagnostic plots for the Tanzania-free summer model.
plot_values(results, df_summer)
# Influence diagnostics; criterion="cooks" sizes points by Cook's distance.
sm.graphics.influence_plot(results, criterion="cooks");
# Collect the per-season models (fitted without Tanzania) plus the
# cumulative model, ordered with 'cumulative' first.
season_vars = get_season_vars(with_tanzania=False)
season_vars['cumulative'] = results_total2
keys_list = list(season_vars.keys())
keys_list.insert(0, keys_list.pop())  # move 'cumulative' from last to first
season_vars = dict(sorted(season_vars.items(), key=lambda pair: keys_list.index(pair[0])))
# BUG FIX: the frame has an integer index (see the Tanzania rows above, e.g.
# 310/312/314), so `df_seasons.index != 'Tanzania'` filtered out nothing.
# Filter on `country`, as the per-season cells do, so the observed values
# line up with the 71-observation Tanzania-free models.
df_seasons2 = df_seasons[df_seasons.country != 'Tanzania'].copy()
season_vars.keys()
dict_keys(['cumulative', 'winter_1', 'spring', 'summer', 'autumn', 'winter_2'])
# 3x4 grid of diagnostics: for each model, an observed-vs-predicted panel
# (even slot) followed by a residuals-vs-predicted panel (odd slot).
item_plots = {'cumulative': [0, 1],
              'winter_1': [2, 3],
              'spring': [4, 5],
              'summer': [6, 7],
              'autumn': [8, 9],
              'winter_2': [10, 11]}
fig, axs = plt.subplots(3, 4, figsize=(18, 12))
ax_list = list(axs.flat)
for season, model in season_vars.items():
    predicted = model.predict()
    residuals = model.resid
    # the cumulative model targets confirmed_log; seasonal models target
    # new_cases_per_100k_log
    if season == 'cumulative':
        observed = df_newest['confirmed_log']
    else:
        observed = df_seasons2.loc[(df_seasons2.season == season, 'new_cases_per_100k_log')]
    for slot in item_plots[season]:
        ax = ax_list.pop(0)
        is_observed_panel = slot % 2 == 0
        values = observed if is_observed_panel else residuals
        kind = 'Observed' if is_observed_panel else 'Residuals'
        sns.regplot(x=predicted, y=values, lowess=True, ax=ax,
                    line_kws={'color': 'grey'})
        ax.set_title(f'Model: {season}\n {kind} vs. Predicted Values', fontsize=14)
        ax.set(xlabel='Predicted', ylabel=kind)
plt.tight_layout()
fig.savefig('resids.png')
season_vars
{'cumulative': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x1d3213c61f0>,
'winter_1': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x1d32243d8b0>,
'spring': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x1d3249ba7f0>,
'summer': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x1d3249c9fa0>,
'autumn': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x1d3249db340>,
'winter_2': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x1d3249e9220>}
get_plot(results_total, df_newest, savefig=True)
from functools import reduce

# Predictors screened for multicollinearity.
vars_to_use = ['individualism_index', 'gdp_per_capita',
               'population_density',
               'mobility_index', 'containment_health_index', 'democracy_index', 'distance_log', 'individualism_gov',
               'cases_neighbors']
# Variance inflation factor for every predictor column.
X = df[vars_to_use]
vif = pd.DataFrame()
vif['VIF'] = [variance_inflation_factor(X.values, col) for col in range(X.shape[1])]
vif['variable'] = X.columns
season_vars
{'cumulative': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2299a6c4370>,
'winter_1': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2299c1ade80>,
'spring': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2299c2a6c70>,
'summer': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2299c5532b0>,
'autumn': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2299c56ff40>,
'winter_2': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2299c566f40>}
# One-hot encode the continent column and fold the dummies back in,
# dropping the original categorical column.
continent_dummies = pd.get_dummies(df['continent'])
df2 = pd.concat((df.copy().drop('continent', axis=1), continent_dummies), axis=1)
# 'N.A.' (North America) is not a valid identifier in formulas; rename it.
df2 = df2.rename(columns={'N.A.': 'NA'})
vars_to_use = ['individualism_index', 'gdp_per_capita',
               'population_density',
               'mobility_index', 'containment_health_index', 'democracy_index', 'distance_log',
               'cases_neighbors', 'AF', 'AS', 'EU', 'NA', 'OC']
def join_strings(xs, sep):
    """Join the strings in *xs* with *sep* between them.

    Replaces the previous reduce-based implementation with the idiomatic
    str.join. Identical output for non-empty sequences; additionally
    returns '' for an empty sequence instead of raising TypeError.
    """
    return sep.join(xs)
# Manual VIF: regress each predictor on all the others and use
# 1 / (1 - R^2) of that auxiliary regression.
vifs = {}
for v in vars_to_use:
    predictors = [c for c in vars_to_use if c != v]
    formula = v + " ~ " + join_strings(predictors, " + ")
    aux_fit = smf.ols(formula, data = df2).fit()
    inflation = 1 / (1 - aux_fit.rsquared)
    vifs[v] = inflation
    # Flag predictors whose VIF exceeds the conventional threshold of 3.
    if inflation > 3:
        print("\n" + 80 * "=")
        print(v)
        print(inflation)
multicolinearity_df = pd.DataFrame(vifs.items())
================================================================================ distance_log 4.529382769257378 ================================================================================ AS 9.840883109482503 ================================================================================ EU 6.892263022892055
# Export the full correlation matrix to LaTeX.
with open('corr_coef.tex','w') as tf:
    tf.write(df2[vars_to_use].corr().to_latex())
# Narrow the set to the continuous predictors only.
vars_to_use = ['individualism_index', 'gdp_per_capita',
               'population_density',
               'mobility_index', 'containment_health_index', 'democracy_index', 'distance_log',
               'cases_neighbors']
df_corr = df2[vars_to_use].corr()
# Short column labels keep the LaTeX table readable.
df2_c = df2[vars_to_use]
df2_c.columns = ['ind', 'gdp' ,'popdens', 'mob', 'health', 'democr', 'dist', 'neighb']
df2_c.corr()
# Overwrite the export with the abbreviated version.
with open('corr_coef.tex','w') as tf:
    tf.write(df2_c.corr().to_latex())
# Export the VIF table computed above.
multicolinearity_df.columns = ['Variable', 'VIF']
with open('multicolinearity.tex','w') as tf:
    tf.write(multicolinearity_df.to_latex())
import statsmodels.stats.api as sms

# Breusch-Pagan heteroscedasticity test for every fitted model.
stder = {season: {} for season in season_vars}
names = ['Lagrange multiplier statistic', 'p-value',
         'F-statistic', 'p-value for F-statistic']
for season, fit in season_vars.items():
    bp = sms.het_breuschpagan(fit.resid, fit.model.exog)
    for name, value in zip(names, bp):
        stder[season][name] = value
stder = pd.DataFrame(stder).round(4)
with open('breuschp.tex','w') as tf:
    tf.write(stder.to_latex())
OLSResults.et_robustcov_results()
import statsmodels.tsa.api as smt
# Autocorrelation of the cumulative model's residuals, lags 1-40, 95% bands.
acf = smt.graphics.plot_acf(results_total.resid, lags=40 , alpha=0.05)
acf.show()
<ipython-input-86-5396dca5761b>:2: UserWarning: Matplotlib is currently using module://ipykernel.pylab.backend_inline, which is a non-GUI backend, so cannot show the figure. acf.show()
season_vars
{'cumulative': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2299a6c4370>,
'winter_1': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2299c1ade80>,
'spring': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2299c2a6c70>,
'summer': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2299c5532b0>,
'autumn': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2299c56ff40>,
'winter_2': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2299c566f40>}
# Residual autocorrelation (lags 1-10) for every model, 2x3 grid.
fig, axs = plt.subplots(2,3, figsize = (18,8))
grid_axes = [ax for row in axs for ax in row]
for (season, model), ax in zip(season_vars.items(), grid_axes):
    acf = smt.graphics.plot_acf(model.resid, lags=10 , alpha=0.05, ax=ax)
    ax.set_title(f"Model: {season}", fontsize = 16)
plt.tight_layout()
fig.savefig('autocorrelation.png')
There is no autocorrelation in the residuals.
# Durbin Watson test
from statsmodels.stats.stattools import durbin_watson

# DW statistic (serial correlation in residuals) per fitted model.
durbs = {season: durbin_watson(model.resid) for season, model in season_vars.items()}
durbs_df = pd.DataFrame(durbs.items())
durbs_df.columns=['Model', 'Durbin-Watson statistic']
with open('dw_test.tex','w') as tf:
    tf.write(durbs_df.to_latex())
durbin_watson(results_total.resid)
1.6113160773978314
durbin_watson(results.resid)
2.1343740259697963
According to the sample size and number of regressors, the statistic should lie between 1.369 and 1.910, as given by the DW statistical table. This means that the model is not suffering from serial correlation in the residuals.
df_newest
| country | month | idv | confirmed | dead | population | current_health_expenditure_of_gdp | gdp_per_capita_constant_2010_us | population_ages_65_and_above_total | population_density_people_per_sq_km_of_land_area | ... | containment_health_index | democracy_index | capital | distance | continent | confirmed_log | current_health_expenditure_of_gdp_log | distance_log | gdp_per_capita_constant_2010_us_log | season | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10 | Argentina | 12.0 | 46.0 | 1602163.0 | 43018.0 | 44938712.0 | 9.124315 | 9729.141574 | 5052508.0 | 16.258510 | ... | 73.400000 | 70.2 | Buenos Aires | 19206.559549 | SA | 14.286865 | 2.210943 | 9.863007 | 9.182881 | winter_2 |
| 21 | Australia | 12.0 | 90.0 | 28381.0 | 909.0 | 25364307.0 | 9.205948 | 57071.168295 | 4038302.0 | 3.247871 | ... | 60.961935 | 90.9 | Canberra | 8153.589761 | OC | 10.253475 | 2.219850 | 9.006214 | 10.952054 | winter_2 |
| 32 | Austria | 12.0 | 55.0 | 356351.0 | 6086.0 | 8877067.0 | 10.396617 | 50654.730148 | 1693354.0 | 107.127967 | ... | 72.941579 | 82.9 | Vienna | 8081.565310 | EU | 12.783671 | 2.341480 | 8.997341 | 10.832788 | winter_2 |
| 43 | Bangladesh | 12.0 | 20.0 | 512496.0 | 7531.0 | 163046161.0 | 2.274249 | 1287.821425 | 8446364.0 | 1239.579312 | ... | 72.120000 | 58.8 | Dhaka | 2477.356735 | AS | 13.147048 | 0.821650 | 7.814947 | 7.160707 | winter_2 |
| 54 | Belarus | 12.0 | 25.0 | 192361.0 | 1414.0 | 9466856.0 | 5.925786 | 6678.509065 | 1439042.0 | 46.719504 | ... | 36.265714 | 24.8 | Minsk | 7119.018739 | EU | 12.167129 | 1.779313 | 8.870525 | 8.806650 | winter_2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 747 | United Kingdom | 12.0 | 89.0 | 2432892.0 | 72548.0 | 66834405.0 | 9.631694 | 43688.437455 | 12370177.0 | 274.708982 | ... | 66.425556 | 85.2 | London | 8894.845144 | EU | 14.704591 | 2.265059 | 9.093227 | 10.684839 | winter_2 |
| 758 | United States | 12.0 | 91.0 | 19346790.0 | 335789.0 | 328239523.0 | 17.061269 | 55809.007792 | 53206334.0 | 35.713622 | ... | 63.078571 | 79.6 | Washington D.C. | 12222.325032 | N.A. | 16.778037 | 2.836811 | 9.411019 | 10.929691 | winter_2 |
| 769 | Uruguay | 12.0 | 36.0 | 17962.0 | 168.0 | 3461734.0 | 9.296212 | 14597.308096 | 517226.0 | 19.708028 | ... | 53.426552 | 83.8 | Montevideo | 19011.570501 | SA | 9.796014 | 2.229607 | 9.852803 | 9.588592 | winter_2 |
| 780 | Vietnam | 12.0 | 20.0 | 1456.0 | 35.0 | 96462106.0 | 5.532128 | 2082.243694 | 7286432.0 | 308.125246 | ... | 56.821429 | 30.8 | Hanoi | 1355.430475 | AS | 7.283448 | 1.710573 | 7.211874 | 7.641201 | winter_2 |
| 791 | Zambia | 12.0 | 35.0 | 20462.0 | 386.0 | 17861030.0 | 4.470341 | 1658.135434 | 377817.0 | 23.341479 | ... | 53.677391 | 50.9 | Lusaka | 10501.353873 | AF | 9.926325 | 1.497465 | 9.259259 | 7.413449 | winter_2 |
72 rows × 29 columns
# `scipy.stats.stats` is a deprecated private alias (removed in modern
# SciPy); import pearsonr from the public scipy.stats namespace instead.
from scipy.stats import pearsonr

# Correlation between each (numeric) predictor and the cumulative model's
# residuals — should be ~0 if the exogeneity assumption holds.
corr_error = {}
for column in vars_to_use:
    # Skip string-typed columns; pearsonr needs numeric input.
    if not isinstance(df_newest[column].iloc[0], str):
        corr_test = pearsonr(df_newest[column], results_total.resid)
        corr_error[column] = {'correlation': corr_test[0].round(4), 'p-value': corr_test[1].round(4)}
        print(f'Variable: {column} --- correlation: {corr_test[0]:.4f}, p-value: {corr_test[1]:.4f}')
Variable: individualism_index --- correlation: -0.0000, p-value: 1.0000 Variable: gdp_per_capita --- correlation: -0.0000, p-value: 1.0000 Variable: population_density --- correlation: -0.0000, p-value: 1.0000 Variable: mobility_index --- correlation: 0.0115, p-value: 0.9235 Variable: containment_health_index --- correlation: 0.0567, p-value: 0.6363 Variable: democracy_index --- correlation: -0.0000, p-value: 1.0000 Variable: distance_log --- correlation: -0.0000, p-value: 1.0000 Variable: cases_neighbors --- correlation: 0.0000, p-value: 1.0000
vars_to_use = ['individualism_index', 'gdp_per_capita',
               'population_density',
               'mobility_index', 'containment_health_index', 'democracy_index', 'distance_log',
               'cases_neighbors']
# Predictor-vs-residual correlation for every model. The per-season data
# frame is now selected once per season (it was recomputed inside the
# inner column loop), and the separate dict-initialisation loop is merged.
corr_error = {}
for season, model in season_vars.items():
    corr_error[season] = {}
    if season == 'cumulative':
        # NOTE(review): df_newest2 is defined elsewhere in the notebook — confirm.
        df_ = df_newest2[vars_to_use].copy()
    else:
        df_ = df_seasons2.loc[df_seasons2.season == season, vars_to_use].copy()
    for column in vars_to_use:
        # Skip string-typed columns; pearsonr needs numeric input.
        if not isinstance(df_[column].iloc[0], str):
            corr_test = pearsonr(df_[column], model.resid)
            corr_error[season][column] = corr_test[0].round(4)
corr_error = pd.DataFrame(corr_error)
with open('corr_error.tex','w') as tf:
    tf.write(corr_error.to_latex())
There is no relationship between any of the predictors and the error term.
# Cumulative-model predictions and residuals for manual inspection.
fitted_vals = results_total.predict()
resids = results_total.resid
y = df_newest['confirmed_log']
# Inspect row 780 (Vietnam).
df_newest[df_newest.index == 780]
| country | month | idv | confirmed | dead | population | current_health_expenditure_of_gdp | gdp_per_capita_constant_2010_us | population_ages_65_and_above_total | population_density_people_per_sq_km_of_land_area | ... | containment_health_index | democracy_index | capital | distance | continent | confirmed_log | current_health_expenditure_of_gdp_log | distance_log | gdp_per_capita_constant_2010_us_log | season | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 780 | Vietnam | 12.0 | 20.0 | 1456.0 | 35.0 | 96462106.0 | 5.532128 | 2082.243694 | 7286432.0 | 308.125246 | ... | 56.821429 | 30.8 | Hanoi | 1355.430475 | AS | 7.283448 | 1.710573 | 7.211874 | 7.641201 | winter_2 |
1 rows × 29 columns
# Refit the cumulative model without Vietnam as an outlier check.
results_total2 = smf.ols('confirmed_log ~ ' + ' + '.join(vars_to_use), data = df_newest[df_newest.country != 'Vietnam']).fit()
results_total2.summary()
| Dep. Variable: | confirmed_log | R-squared: | 0.474 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.397 |
| Method: | Least Squares | F-statistic: | 6.119 |
| Date: | Sun, 03 Jan 2021 | Prob (F-statistic): | 4.34e-06 |
| Time: | 14:17:02 | Log-Likelihood: | -125.63 |
| No. Observations: | 71 | AIC: | 271.3 |
| Df Residuals: | 61 | BIC: | 293.9 |
| Df Model: | 9 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | 9.7351 | 3.794 | 2.566 | 0.013 | 2.149 | 17.322 |
| idv | 0.0194 | 0.012 | 1.578 | 0.120 | -0.005 | 0.044 |
| current_health_expenditure_of_gdp | 0.3192 | 0.111 | 2.869 | 0.006 | 0.097 | 0.542 |
| gdp_per_capita_constant_2010_us | -1.751e-05 | 1.23e-05 | -1.424 | 0.160 | -4.21e-05 | 7.09e-06 |
| over65_per_capita | -2.4652 | 4.478 | -0.550 | 0.584 | -11.420 | 6.490 |
| population_density_people_per_sq_km_of_land_area | -6.924e-05 | 0.000 | -0.324 | 0.747 | -0.000 | 0.000 |
| mobility_index | 0.0020 | 0.022 | 0.092 | 0.927 | -0.042 | 0.046 |
| containment_health_index | 0.1059 | 0.020 | 5.413 | 0.000 | 0.067 | 0.145 |
| democracy_index | -0.0412 | 0.015 | -2.668 | 0.010 | -0.072 | -0.010 |
| distance_log | -0.3605 | 0.410 | -0.879 | 0.383 | -1.180 | 0.459 |
| Omnibus: | 1.090 | Durbin-Watson: | 1.681 |
|---|---|---|---|
| Prob(Omnibus): | 0.580 | Jarque-Bera (JB): | 0.795 |
| Skew: | -0.259 | Prob(JB): | 0.672 |
| Kurtosis: | 3.028 | Cond. No. | 9.20e+05 |
results_total.summary()
| Dep. Variable: | confirmed_log | R-squared: | 0.409 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.323 |
| Method: | Least Squares | F-statistic: | 4.767 |
| Date: | Sun, 03 Jan 2021 | Prob (F-statistic): | 7.78e-05 |
| Time: | 14:18:00 | Log-Likelihood: | -134.24 |
| No. Observations: | 72 | AIC: | 288.5 |
| Df Residuals: | 62 | BIC: | 311.3 |
| Df Model: | 9 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | 3.8786 | 3.791 | 1.023 | 0.310 | -3.699 | 11.456 |
| idv | 0.0200 | 0.013 | 1.484 | 0.143 | -0.007 | 0.047 |
| current_health_expenditure_of_gdp | 0.2310 | 0.119 | 1.935 | 0.058 | -0.008 | 0.470 |
| gdp_per_capita_constant_2010_us | -1.678e-05 | 1.35e-05 | -1.242 | 0.219 | -4.38e-05 | 1.02e-05 |
| over65_per_capita | -0.4140 | 4.882 | -0.085 | 0.933 | -10.173 | 9.345 |
| population_density_people_per_sq_km_of_land_area | -1.116e-05 | 0.000 | -0.048 | 0.962 | -0.000 | 0.000 |
| mobility_index | 0.0078 | 0.024 | 0.325 | 0.747 | -0.040 | 0.056 |
| containment_health_index | 0.1071 | 0.021 | 4.986 | 0.000 | 0.064 | 0.150 |
| democracy_index | -0.0316 | 0.017 | -1.887 | 0.064 | -0.065 | 0.002 |
| distance_log | 0.2459 | 0.413 | 0.595 | 0.554 | -0.580 | 1.072 |
| Omnibus: | 6.645 | Durbin-Watson: | 1.531 |
|---|---|---|---|
| Prob(Omnibus): | 0.036 | Jarque-Bera (JB): | 5.971 |
| Skew: | -0.578 | Prob(JB): | 0.0505 |
| Kurtosis: | 3.810 | Cond. No. | 8.89e+05 |
get_plot(results_total2, df_newest[df_newest.country != 'Vietnam'])
Results do not change after checking for outliers
# Influence plot (leverage vs. studentized residuals, Cook's distance)
# for every fitted model, 3x2 grid.
fig, axs = plt.subplots(3,2, figsize = (15,12))
grid_axes = [ax for row in axs for ax in row]
for (season, model), ax in zip(season_vars.items(), grid_axes):
    sm.graphics.influence_plot(model, criterion = 'cooks', ax=ax, size = 0.1, alpha = 0.001)
    ax.set_title(f"Model: {season}", fontsize = 16)
plt.tight_layout()
fig.savefig('leverage.png')
# Refit without Singapore (the high-leverage point) and re-inspect.
results_total3 = smf.ols('confirmed_log ~ ' + ' + '.join(vars_to_use), data = df_newest[df_newest.country != 'Singapore']).fit()
get_plot(results_total3, df_newest[df_newest.country != 'Singapore'])
results_total3.summary()
| Dep. Variable: | confirmed_log | R-squared: | 0.425 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.340 |
| Method: | Least Squares | F-statistic: | 5.009 |
| Date: | Sun, 03 Jan 2021 | Prob (F-statistic): | 4.72e-05 |
| Time: | 14:26:54 | Log-Likelihood: | -131.72 |
| No. Observations: | 71 | AIC: | 283.4 |
| Df Residuals: | 61 | BIC: | 306.1 |
| Df Model: | 9 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | 4.9336 | 3.832 | 1.287 | 0.203 | -2.730 | 12.597 |
| idv | 0.0196 | 0.013 | 1.466 | 0.148 | -0.007 | 0.046 |
| current_health_expenditure_of_gdp | 0.2373 | 0.118 | 2.002 | 0.050 | 0.000 | 0.474 |
| gdp_per_capita_constant_2010_us | -1.985e-05 | 1.36e-05 | -1.463 | 0.149 | -4.7e-05 | 7.29e-06 |
| over65_per_capita | -0.2135 | 4.844 | -0.044 | 0.965 | -9.900 | 9.473 |
| population_density_people_per_sq_km_of_land_area | -0.0013 | 0.001 | -1.391 | 0.169 | -0.003 | 0.001 |
| mobility_index | 0.0169 | 0.025 | 0.688 | 0.494 | -0.032 | 0.066 |
| containment_health_index | 0.1160 | 0.022 | 5.224 | 0.000 | 0.072 | 0.160 |
| democracy_index | -0.0278 | 0.017 | -1.656 | 0.103 | -0.061 | 0.006 |
| distance_log | 0.0842 | 0.425 | 0.198 | 0.844 | -0.766 | 0.934 |
| Omnibus: | 5.912 | Durbin-Watson: | 1.469 |
|---|---|---|---|
| Prob(Omnibus): | 0.052 | Jarque-Bera (JB): | 5.131 |
| Skew: | -0.542 | Prob(JB): | 0.0769 |
| Kurtosis: | 3.746 | Cond. No. | 8.71e+05 |
Singapore, the point with high leverage, does not change any of the conclusions reached
from scipy import stats

def normality_of_residuals_test(model):
    """Draw the normal Q-Q plot of the residuals and run four statistical
    tests investigating the normality of the residuals.

    Arg:
        * model - fitted OLS model from statsmodels
    """
    resid = model.resid
    # Q-Q plot against the standardized normal line.
    sm.ProbPlot(resid).qqplot(line='s');
    plt.title('Q-Q plot');
    jarque = stats.jarque_bera(resid)
    shapiro_res = stats.shapiro(resid)
    anderson_res = stats.anderson(resid, dist='norm')
    kstest_res = stats.kstest(resid, 'norm')
    print(f'Jarque-Bera test ---- statistic: {jarque[0]:.4f}, p-value: {jarque[1]}')
    print(f'Shapiro-Wilk test ---- statistic: {shapiro_res[0]:.4f}, p-value: {shapiro_res[1]:.4f}')
    print(f'Kolmogorov-Smirnov test ---- statistic: {kstest_res.statistic:.4f}, p-value: {kstest_res.pvalue:.4f}')
    print(f'Anderson-Darling test ---- statistic: {anderson_res.statistic:.4f}, 5% critical value: {anderson_res.critical_values[2]:.4f}')
    print('If the returned AD statistic is larger than the critical value, then for the 5% significance level, the null hypothesis that the data come from the Normal distribution should be rejected. ')
# Q-Q plot of the residuals of every fitted model, saved as one figure.
fig, axs = plt.subplots(3,2, figsize = (15,12))
grid_axes = [ax for row in axs for ax in row]
for (season, model), ax in zip(season_vars.items(), grid_axes):
    sm.ProbPlot(model.resid).qqplot(line='s', ax=ax);
    ax.set_title(f"Model: {season}", fontsize = 16)
plt.tight_layout()
fig.savefig('normality_tests.png')
# NOTE(review): this cell is an exact duplicate of the Q-Q grid cell above
# (re-executed in the notebook); it overwrites the same output file.
fig, axs = plt.subplots(3,2, figsize = (15,12))
ax_list = [item for sublist in axs for item in sublist]
for season, model in season_vars.items():
    ax = ax_list.pop(0)
    sm.ProbPlot(model.resid).qqplot(line='s', ax=ax);
    ax.set_title(f"Model: {season}", fontsize = 16)
plt.tight_layout()
fig.savefig('normality_tests.png')
# Normality-test p-values per model, exported as a LaTeX table.
# The unused Anderson-Darling computation (`ad = stats.anderson(...)`)
# has been removed: its result was never stored or printed.
tests = {}
for season, model in season_vars.items():
    jb = stats.jarque_bera(model.resid)
    sw = stats.shapiro(model.resid)
    ks = stats.kstest(model.resid, 'norm')
    tests[season] = {'Jarque-Bera test': jb[1],
                     'Shapiro-Wilk test': sw[1],
                     'Kolmogorov-Smirnov test': ks.pvalue}
tests = pd.DataFrame(tests).round(4)
with open('tests_norm.tex','w') as tf:
    tf.write(tests.to_latex())
normality_of_residuals_test(results_total)
Jarque-Bera test ---- statistic: 22.2804, p-value: 1.4517044101491905e-05 Shapiro-Wilk test ---- statistic: 0.9489, p-value: 0.0055 Kolmogorov-Smirnov test ---- statistic: 0.1108, p-value: 0.3161 Anderson-Darling test ---- statistic: 0.8127, 5% critical value: 0.7490 If the returned AD statistic is larger than the critical value, then for the 5% significance level, the null hypothesis that the data come from the Normal distribution should be rejected.
def y_model(ind, cont):
    """Illustrative prediction for individualism *ind* and containment
    *cont*: intercept 10, main effects 0.048 and 0.211, interaction -0.001."""
    interaction = ind * cont
    return 10 + 0.048 * ind + 0.211 * cont - 0.001 * interaction
y_model(10, 30)
16.51
# Sweep the illustrative model over three containment levels and a range
# of individualism scores, printing each prediction.
for containment in [20, 50, 80]:
    print("\n")  # two blank lines between containment sections
    print(f"------- CONT: {containment}----------------------")
    for indiv in range(5, 100, 5):
        print(f"Individualism: {indiv}, Results: {y_model(indiv, containment)}")
------- CONT: 20---------------------- Individualism: 5, Results: 14.360000000000001 Individualism: 10, Results: 14.5 Individualism: 15, Results: 14.64 Individualism: 20, Results: 14.78 Individualism: 25, Results: 14.919999999999998 Individualism: 30, Results: 15.06 Individualism: 35, Results: 15.2 Individualism: 40, Results: 15.34 Individualism: 45, Results: 15.479999999999999 Individualism: 50, Results: 15.620000000000001 Individualism: 55, Results: 15.76 Individualism: 60, Results: 15.899999999999999 Individualism: 65, Results: 16.04 Individualism: 70, Results: 16.18 Individualism: 75, Results: 16.32 Individualism: 80, Results: 16.459999999999997 Individualism: 85, Results: 16.6 Individualism: 90, Results: 16.74 Individualism: 95, Results: 16.880000000000003 ------- CONT: 50---------------------- Individualism: 5, Results: 20.54 Individualism: 10, Results: 20.53 Individualism: 15, Results: 20.52 Individualism: 20, Results: 20.509999999999998 Individualism: 25, Results: 20.5 Individualism: 30, Results: 20.49 Individualism: 35, Results: 20.479999999999997 Individualism: 40, Results: 20.47 Individualism: 45, Results: 20.46 Individualism: 50, Results: 20.45 Individualism: 55, Results: 20.439999999999998 Individualism: 60, Results: 20.43 Individualism: 65, Results: 20.42 Individualism: 70, Results: 20.409999999999997 Individualism: 75, Results: 20.4 Individualism: 80, Results: 20.39 Individualism: 85, Results: 20.38 Individualism: 90, Results: 20.369999999999997 Individualism: 95, Results: 20.36 ------- CONT: 80---------------------- Individualism: 5, Results: 26.72 Individualism: 10, Results: 26.56 Individualism: 15, Results: 26.400000000000002 Individualism: 20, Results: 26.24 Individualism: 25, Results: 26.08 Individualism: 30, Results: 25.92 Individualism: 35, Results: 25.759999999999998 Individualism: 40, Results: 25.599999999999998 Individualism: 45, Results: 25.439999999999998 Individualism: 50, Results: 25.28 Individualism: 55, Results: 25.119999999999997 
Individualism: 60, Results: 24.959999999999997 Individualism: 65, Results: 24.8 Individualism: 70, Results: 24.639999999999997 Individualism: 75, Results: 24.479999999999997 Individualism: 80, Results: 24.32 Individualism: 85, Results: 24.16 Individualism: 90, Results: 24.0 Individualism: 95, Results: 23.839999999999996
# Reload the final dataset, drop Tanzania, and plot individualism vs.
# containment with a linear fit for each month (February-December).
df = pd.read_csv('2021-01-04 final_data.csv')
df = df[df.country != 'Tanzania']
fig, axs = plt.subplots(3,4, figsize = (20,15))
grid_axes = [ax for row in axs for ax in row]
for month, ax in zip(range(2,13), grid_axes):
    df_ = df[df.month == month]
    sns.regplot(x = 'individualism_index', y = 'containment_health_index', data = df_, ax=ax)
    ax.set_title(f"Month: {month}")
# Same monthly panels, but with a quadratic (order=2) regression fit.
fig, axs = plt.subplots(3,4, figsize = (20,15))
ax_list = [item for sublist in axs for item in sublist]
for month in range(2,13):
    ax = ax_list.pop(0)
    df_ = df[df.month == month]
    sns.regplot(x = 'individualism_index', y = 'containment_health_index', data = df_, ax=ax, order = 2)
    ax.set_title(f"Month: {month}")
# Monthly scatter of individualism vs. containment, with marker colour and
# size both encoding the log of new cases per 100k.
fig, axs = plt.subplots(3,4, figsize = (20,15))
ax_list = [item for sublist in axs for item in sublist]
for month in range(2,13):
    ax = ax_list.pop(0)
    df_ = df[df.month == month]
    sns.scatterplot(x = 'individualism_index', y = 'containment_health_index',
                    hue = 'new_cases_per_100k_log',
                    size ='new_cases_per_100k_log',
                    data = df_, ax=ax)
    ax.set_title(f"Month: {month}")
# December snapshot per country. `.copy()` prevents pandas'
# SettingWithCopyWarning (assigning new columns to a filtered slice).
df_newest = df[df.month==12].copy()
df_newest = df_newest.set_index('country')
# Whole-period per-country averages (aligned on the country index).
df_newest['mobility_index_avg'] = df.groupby('country').mobility_index.mean()
df_newest['containment_health_index_avg'] = df.groupby('country').containment_health_index.mean()
# get_interactions is defined elsewhere in this notebook.
df_newest = get_interactions(df_newest, month_12=True)
# Cumulative confirmed cases per 100k inhabitants, and its log.
df_newest['confirmed_per_100k'] = (df_newest['confirmed'] / df_newest['population'] ) * 100_000
df_newest['confirmed_per_100k_log'] = np.log(df_newest['confirmed_per_100k'])
fig, ax = plt.subplots(figsize=(12,7))
sns.scatterplot(x = 'individualism_index', y = 'containment_health_index_avg',
                hue = 'confirmed_per_100k_log',
                size ='confirmed_per_100k_log',
                data = df_newest, ax=ax);
# Monthly scatter of new cases vs. containment, with marker size encoding
# the individualism index.
fig, axs = plt.subplots(3,4, figsize = (20,15))
ax_list = [item for sublist in axs for item in sublist]
for month in range(2,13):
    ax = ax_list.pop(0)
    df_ = df[df.month == month]
    sns.scatterplot(x = 'new_cases_per_100k_log', y = 'containment_health_index',
                    size ='individualism_index',
                    data = df_, ax=ax)
    ax.set_title(f"Month: {month}")
sns.regplot(x = 'individualism_index', y = 'containment_health_index_avg', data = df_newest, order = 2)
<AxesSubplot:xlabel='individualism_index', ylabel='containment_health_index_avg'>
import scipy.stats as st

def checks_input_mcar_tests(data):
    """ Checks whether the input parameter of class McarTests is correct

    Parameters
    ----------
    data:
        The input of McarTests specified as 'data'

    Returns
    -------
    bool
        True if input is correct
    """
    if not isinstance(data, pd.DataFrame):
        print("Error: Data should be a Pandas DataFrame")
        return False
    # `np.float` / `np.int` were deprecated in NumPy 1.20 and removed in
    # 1.24, so the original comparisons now raise AttributeError. Test the
    # dtype kinds instead; np.issubdtype also accepts float32/int32 etc.,
    # which the old equality check against the Python builtins missed.
    dtypes = data.dtypes.values
    has_float = any(np.issubdtype(dt, np.floating) for dt in dtypes)
    has_int = any(np.issubdtype(dt, np.integer) for dt in dtypes)
    if not (has_float or has_int):
        print("Error: Dataset cannot contain other value types than floats and/or integers")
        return False
    # The MCAR tests only make sense on data with missing values.
    if not data.isnull().values.any():
        print("Error: No NaN's in given data")
        return False
    return True
class McarTests():
    """Tests of the Missing Completely At Random (MCAR) assumption for an
    incomplete dataset."""
    def __init__(self, data):
        # data: incomplete pandas DataFrame (samples as rows, variables as columns)
        self.data = data
    def mcar_test(self):
        """ Implementation of Little's MCAR test

        Parameters
        ----------
        data: Pandas DataFrame
            An incomplete dataset with samples as index and variables as columns

        Returns
        -------
        p_value: Float
            This value is the outcome of a chi-square statistical test, testing whether the null hypothesis
            'the missingness mechanism of the incomplete dataset is MCAR' can be rejected.
        """
        if not checks_input_mcar_tests(self.data):
            raise Exception("Input not correct")
        dataset = self.data.copy()
        # NOTE: `vars` shadows the builtin; kept as-is.
        vars = dataset.dtypes.index.values
        n_var = dataset.shape[1]
        # mean and covariance estimates
        # ideally, this is done with a maximum likelihood estimator
        gmean = dataset.mean()
        gcov = dataset.cov()
        # set up missing data patterns
        # r: 0/1 indicator matrix of missingness per cell.
        r = 1 * dataset.isnull()
        # Encode each row's missingness pattern as an integer (bitmask over
        # columns). NOTE(review): `ma` is not imported in this chunk —
        # presumably `math` under an alias defined elsewhere; confirm.
        mdp = np.dot(r, list(map(lambda x: ma.pow(2, x), range(n_var))))
        sorted_mdp = sorted(np.unique(mdp))
        n_pat = len(sorted_mdp)
        # Re-map pattern codes to consecutive integers 0..n_pat-1.
        correct_mdp = list(map(lambda x: sorted_mdp.index(x), mdp))
        dataset['mdp'] = pd.Series(correct_mdp, index=dataset.index)
        # calculate statistic and df
        pj = 0
        d2 = 0
        for i in range(n_pat):
            # Rows sharing missingness pattern i.
            dataset_temp = dataset.loc[dataset['mdp'] == i, vars]
            # Variables fully observed within this pattern.
            select_vars = ~dataset_temp.isnull().any()
            pj += np.sum(select_vars)
            select_vars = vars[select_vars]
            # Deviation of the pattern's means from the grand means.
            means = dataset_temp[select_vars].mean() - gmean[select_vars]
            select_cov = gcov.loc[select_vars, select_vars]
            mj = len(dataset_temp)
            # Mahalanobis-type contribution of this pattern to the statistic.
            parta = np.dot(means.T, np.linalg.solve(select_cov, np.identity(select_cov.shape[1])))
            d2 += mj * (np.dot(parta, means))
        df = pj - n_var
        # perform test and save output
        p_value = 1 - st.chi2.cdf(d2, df)
        return p_value
    def mcar_t_tests(self):
        """ MCAR tests for each pair of variables

        Parameters
        ----------
        data: Pandas DataFrame
            An incomplete dataset with samples as index and variables as columns

        Returns
        -------
        mcar_matrix: Pandas DataFrame
            A square Pandas DataFrame containing True/False for each pair of variables
            True: Missingness in index variable is MCAR for column variable
            False: Missingness in index variable is not MCAR for column variable
        """
        if not checks_input_mcar_tests(self.data):
            raise Exception("Input not correct")
        dataset = self.data.copy()
        vars = dataset.dtypes.index.values
        mcar_matrix = pd.DataFrame(data=np.zeros(shape=(dataset.shape[1], dataset.shape[1])),
                                   columns=vars, index=vars)
        for var in vars:
            for tvar in vars:
                # Compare tvar's observed values where var is missing vs. present
                # (Welch's t-test, unequal variances).
                part_one = dataset.loc[dataset[var].isnull(), tvar].dropna()
                part_two = dataset.loc[~dataset[var].isnull(), tvar].dropna()
                mcar_matrix.loc[var, tvar] = st.ttest_ind(part_one, part_two, equal_var=False).pvalue
        # p > 0.05: no evidence against MCAR for the pair.
        mcar_matrix = mcar_matrix[mcar_matrix.notnull()] > 0.05
        return mcar_matrix
checks_input_mcar_tests(df)
Error: No NaN's in given data
False
# Total GDP (per-capita GDP times population) and its correlation with
# per-capita GDP and individualism, exported for December.
df['gdp'] = df['gdp_per_capita'] * df['population']
december = df[df.month == 12]
with open('corr_coef_gdp.tex','w') as tf:
    tf.write(december[['gdp', 'gdp_per_capita', 'individualism_index']].corr().to_latex())
# Monthly linear fits of democracy index vs. containment index.
fig, axs = plt.subplots(3,4, figsize = (20,15))
ax_list = [item for sublist in axs for item in sublist]
for month in range(2,13):
    ax = ax_list.pop(0)
    df_ = df[df.month == month]
    sns.regplot(x = 'democracy_index', y = 'containment_health_index', data = df_, ax=ax)
    ax.set_title(f"Month: {month}")
# Within-country variability (std) of the containment index vs. the
# country's average democracy index.
df4 = df.groupby(['country']).agg({'containment_health_index': ['std'],
                                   'democracy_index': 'mean'})
# NOTE(review): df4 carries a column MultiIndex; the plot's axis labels end
# up as 'mean'/'std' (see output below) — confirm the intended columns.
sns.regplot(x = 'democracy_index', y = 'containment_health_index', data = df4)
<AxesSubplot:xlabel='mean', ylabel='std'>
df['individualism_index'].mean()
42.309859154929576
plt.hist(df['individualism_index']);
# Bucket countries: high (>= 60), low (<= 25), medium otherwise.
df['ind_category'] = df['individualism_index'].map(lambda x: 'high' if x >= 60 else 'low' if x <= 25 else 'medium')
# Divide the row counts by 11 (months) to get countries per bucket.
df['ind_category'].value_counts() / 11
medium 27.0 low 22.0 high 22.0 Name: ind_category, dtype: float64
# NOTE(review): dfn is assigned here but not used in this cell.
dfn = df[df.month == 12].copy()
fig, axs = plt.subplots(10,2,figsize=(14,25))
graph_titles = {'new_cases_per_100k_log': 'New cases per 100K people, log',
                'containment_health_index': 'Containment Health Index'}
# One row per month (March..December); left column: case distribution,
# right column: containment distribution, each split by individualism bucket.
for i in range(10):
    for k, var in enumerate(['new_cases_per_100k_log', 'containment_health_index']):
        month = int(i+3)
        dfnt = df[df.month == month].copy()
        sns.kdeplot(dfnt.loc[dfnt.ind_category == 'low', var], fill=True, label = 'Low', bw_adjust = .8, alpha = 0.2, ax=axs[i,k])
        sns.kdeplot(dfnt.loc[dfnt.ind_category == 'medium', var], fill=True, label = 'Medium', bw_adjust = .8, alpha = 0.2, ax=axs[i,k])
        sns.kdeplot(dfnt.loc[dfnt.ind_category == 'high', var], fill=True, label = 'High', bw_adjust = .8, alpha = 0.2, ax=axs[i,k])
        axs[i,k].set_title(f"{graph_titles[var]}, {calendar.month_name[month]}")
        axs[i,k].legend(prop={'size': 8})
plt.tight_layout()
fig.savefig('cases_health_dist.png')
import altair as alt
# Reload the final dataset and drop the two outlier countries.
df = pd.read_csv('2021-01-04 final_data.csv')
df = df[df.country != 'Tanzania'].copy()
df = df[df.country != 'Trinidad and Tobago'].copy()
# Same high/low/medium bucketing as earlier, with a display-friendly name.
df['individualism category'] = df['individualism_index'].map(lambda x: 'high' if x >= 60 else 'low' if x <= 25 else 'medium')
# Altair scatter of containment vs. new cases, coloured by individualism
# bucket (February excluded).
color_by_variable = alt.Chart(df[df.month != 2]).mark_circle(opacity=0.3).encode(
    x=alt.X('new_cases_per_100k_log:Q', axis = alt.Axis(title = 'New cases per 100 thousand people (log)')),
    y=alt.Y('containment_health_index', axis = alt.Axis(title = 'Containment health index')),
    color=alt.Color('individualism category',
                    sort = ['low', 'medium', 'high']))
# Overlay a per-bucket regression line on the scatter.
color_by_variable.properties(
) + color_by_variable.transform_regression('new_cases_per_100k_log', 'containment_health_index',
                                           groupby=['individualism category']).mark_line(size=3, opacity = 0.9)